source.dussan.org Git - poi.git/commitdiff
Tweak how we do ooxml properties, and handle hyperlinks for word documents when extra...
author: Nick Burch <nick@apache.org>
Wed, 9 Apr 2008 12:22:23 +0000 (12:22 +0000)
committer: Nick Burch <nick@apache.org>
Wed, 9 Apr 2008 12:22:23 +0000 (12:22 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68

src/ooxml/java/org/apache/poi/POIXMLDocument.java
src/ooxml/java/org/apache/poi/POIXMLProperties.java [new file with mode: 0644]
src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java
src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java
src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java
src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java
src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx [new file with mode: 0755]

index bcc8c0911aa2d8e7d7255d6b81adb9cb99a24835..9fa4789db0e56c472fafc2540f79a590c148d7b1 100644 (file)
@@ -32,9 +32,6 @@ import org.openxml4j.opc.PackageRelationship;
 import org.openxml4j.opc.PackageRelationshipCollection;
 import org.openxml4j.opc.PackageRelationshipTypes;
 import org.openxml4j.opc.PackagingURIHelper;
-import org.openxml4j.opc.internal.PackagePropertiesPart;
-import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
-import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
 
 public abstract class POIXMLDocument {
 
@@ -48,6 +45,12 @@ public abstract class POIXMLDocument {
     /** The OPC core Package Part */
     private PackagePart corePart;
     
+    /**
+     * The properties of the OPC package, opened as needed
+     */
+    private POIXMLProperties properties;
+    
+    
     protected POIXMLDocument() {}
     
     protected POIXMLDocument(Package pkg) throws IOException {
@@ -178,28 +181,13 @@ public abstract class POIXMLDocument {
     }
 
        /**
-        * Get the core document properties (core ooxml properties).
-        * TODO: Replace with nice usermodel wrapper
-        * @deprecated To be replaced with a proper user-model style view of the properties
+        * Get the document properties. This gives you access to the
+        *  core ooxml properties, and the extended ooxml properties.
         */
-       public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException {
-               PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
-               if(propsPart == null) {
-                       return null;
+       public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException {
+               if(properties == null) {
+                       properties = new POIXMLProperties(pkg);
                }
-               return (PackagePropertiesPart)propsPart;
-       }
-       
-       /**
-        * Get the extended document properties (extended ooxml properties)
-        * TODO: Replace with nice usermodel wrapper
-        * @deprecated To be replaced with a proper user-model style view of the properties
-        */
-       public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
-               PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
-               
-               PropertiesDocument props = PropertiesDocument.Factory.parse(
-                               propsPart.getInputStream());
-               return props.getProperties();
+               return properties;
        }
 }
diff --git a/src/ooxml/java/org/apache/poi/POIXMLProperties.java b/src/ooxml/java/org/apache/poi/POIXMLProperties.java
new file mode 100644 (file)
index 0000000..7806c9b
--- /dev/null
@@ -0,0 +1,124 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+import java.io.IOException;
+
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackageRelationshipCollection;
+import org.openxml4j.opc.internal.PackagePropertiesPart;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
+
+/**
+ * Wrapper around the two different kinds of OOXML properties 
+ *  a document can have
+ */
+public class POIXMLProperties {
+       private Package pkg;
+       private CoreProperties core;
+       private ExtendedProperties ext;
+       
+       public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
+               this.pkg = docPackage;
+               
+               // Core properties
+               PackageRelationshipCollection coreRel =
+                       pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE);
+               if(coreRel.size() == 1) {
+                       core = new CoreProperties( (PackagePropertiesPart)
+                                       pkg.getPart(coreRel.getRelationship(0)) );
+               } else {
+                       throw new IllegalArgumentException("A document must always have core properties defined!");
+               }
+               
+               // Extended properties
+               PackageRelationshipCollection extRel =
+                       pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
+               if(extRel.size() == 1) {
+                       PropertiesDocument props = PropertiesDocument.Factory.parse(
+                                       pkg.getPart( extRel.getRelationship(0) ).getInputStream()
+                       );
+                       ext = new ExtendedProperties(props);
+               } else {
+                       ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance());
+               }
+       }
+       
+       /**
+        * Returns the core document properties
+        */
+       public CoreProperties getCoreProperties() {
+               return core;
+       }
+       
+       /**
+        * Returns the extended document properties
+        */
+       public ExtendedProperties getExtendedProperties() {
+               return ext;
+       }
+       
+       /**
+        * Writes out the ooxml properties into the supplied,
+        *  new Package
+        */
+       public void write(Package pkg) {
+               // TODO
+       }
+       
+       /**
+        * The core document properties
+        */
+       public class CoreProperties {
+               private PackagePropertiesPart part;
+               private CoreProperties(PackagePropertiesPart part) {
+                       this.part = part;
+               }
+               
+               public void setTitle(String title) {
+                       part.setTitleProperty(title);
+               }
+               public String getTitle() {
+                       return part.getTitleProperty().getValue();
+               }
+               
+               public PackagePropertiesPart getUnderlyingProperties() {
+                       return part;
+               }
+       }
+       
+       /**
+        * Extended document properties
+        */
+       public class ExtendedProperties {
+               private PropertiesDocument props;
+               private ExtendedProperties(PropertiesDocument props) {
+                       this.props = props;
+                       
+                       if(props.getProperties() == null) {
+                               props.addNewProperties();
+                       }
+               }
+               
+               public CTProperties getUnderlyingProperties() {
+                       return props.getProperties();
+               }
+       }
+}
index c28eba49da16a3a90b46b58969b25bf8f0d4f6fe..ae8514c278e8675e867978e92a22769e71a6daa0 100644 (file)
 ==================================================================== */
 package org.apache.poi;
 
+import java.io.IOException;
+
+import org.apache.poi.POIXMLProperties.*;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+
 public abstract class POIXMLTextExtractor extends POITextExtractor {
        /** The POIXMLDocument that's open */
        protected POIXMLDocument document;
@@ -28,4 +34,17 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
                
                this.document = document;
        }
+       
+       /**
+        * Returns the core document properties
+        */
+       public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException {
+                return document.getProperties().getCoreProperties();
+       }
+       /**
+        * Returns the extended document properties
+        */
+       public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
+               return document.getProperties().getExtendedProperties();
+       }
 }
index 4b54ed86372f6fd80c1cc91d667d204756e66cc8..05b716d752b79987b6e692ec9e6cfe8a7d822307 100644 (file)
@@ -24,6 +24,7 @@ import org.openxml4j.exceptions.InvalidFormatException;
 import org.openxml4j.exceptions.OpenXML4JException;
 import org.openxml4j.opc.Package;
 import org.openxml4j.opc.PackagePart;
+import org.openxml4j.opc.PackageRelationshipCollection;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
@@ -47,6 +48,7 @@ public class XWPFDocument extends POIXMLDocument {
        public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
        public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
        public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
+       public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"; 
        
        private DocumentDocument wordDoc;
        
@@ -89,4 +91,18 @@ public class XWPFDocument extends POIXMLDocument {
                        StylesDocument.Factory.parse(parts[0].getInputStream());
                return sd.getStyles();
        }
+       
+       /**
+        * Returns all the hyperlink relations for the file.
+        * You'll generally want the target of each relation,
+        *  which is the destination of the hyperlink
+        */
+       public PackageRelationshipCollection getHyperlinks() {
+               try {
+                       return getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE); 
+               } catch(InvalidFormatException e) {
+                       // Should never happen
+                       throw new IllegalStateException(e);
+               }
+       }
 }
index 58fa4839c65d8bb8c14b500cd36d661b53c1a1ee..bd1936d16bce2e320b2d79add41550cdd74fc281 100644 (file)
@@ -16,7 +16,6 @@
 ==================================================================== */
 package org.apache.poi.xwpf.extractor;
 
-import java.io.File;
 import java.io.IOException;
 
 import org.apache.poi.POIXMLDocument;
@@ -25,7 +24,9 @@ import org.apache.poi.xwpf.XWPFDocument;
 import org.apache.xmlbeans.XmlException;
 import org.openxml4j.exceptions.OpenXML4JException;
 import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackageRelationship;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
 import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@@ -35,6 +36,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
  */
 public class XWPFWordExtractor extends POIXMLTextExtractor {
        private XWPFDocument document;
+       private boolean fetchHyperlinks = false;
        
        public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
                this(new XWPFDocument(container));
@@ -56,6 +58,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
                        ));
                System.out.println(extractor.getText());
        }
+       
+       /**
+        * Should we also output the target URI of each hyperlink
+        *  when fetching the text content? Default is to only
+        *  output the hyperlink label, and not its target
+        */
+       public void setFetchHyperlinks(boolean fetch) {
+               fetchHyperlinks = fetch;
+       }
 
        public String getText() {
                CTBody body = document.getDocumentBody();
@@ -64,9 +75,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
                // Loop over paragraphs
                CTP[] ps = body.getPArray();
                for (int i = 0; i < ps.length; i++) {
-                       // Loop over ranges
+                       // Loop over ranges and hyperlinks
+                       // TODO - properly intersperse ranges and hyperlinks
                        CTR[] rs = ps[i].getRArray();
-                       for (int j = 0; j < rs.length; j++) {
+                       for(int j = 0; j < rs.length; j++) {
                                // Loop over text runs
                                CTText[] texts = rs[j].getTArray();
                                for (int k = 0; k < texts.length; k++) {
@@ -75,6 +87,26 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
                                        );
                                }
                        }
+                       
+                       CTHyperlink[] hls =  ps[i].getHyperlinkArray();
+                       for(CTHyperlink hl : hls) {
+                               for(CTR r : hl.getRArray()) {
+                                       for(CTText txt : r.getTArray()) {
+                                               text.append(txt.getStringValue());
+                                       }
+                               }
+                               if(fetchHyperlinks) {
+                                       String id = hl.getId();
+                                       if(id != null) {
+                                               PackageRelationship hlRel =
+                                                       document.getHyperlinks().getRelationshipByID(id);
+                                               if(hlRel != null) {
+                                                       text.append(" <" + hlRel.getTargetURI().toString() + ">");
+                                               }
+                                       }
+                               }
+                       }
+                       
                        // New line after each paragraph.
                        text.append("\n");
                }
index c25d08a90ccfc366bcb936276a7e27611e6ad282..682fb97574185c95dde13fc01675d4d150cd1dbb 100644 (file)
@@ -46,7 +46,7 @@ public class TestXSLFSlideShow extends TestCase {
                        if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
                                found = true;
                        }
-                       System.out.println(part);
+                       //System.out.println(part);
                }
                assertTrue(found);
        }
@@ -110,14 +110,14 @@ public class TestXSLFSlideShow extends TestCase {
        public void testMetadataBasics() throws Exception {
                XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
                
-               assertNotNull(xml.getCoreProperties());
-               assertNotNull(xml.getExtendedProperties());
+               assertNotNull(xml.getProperties().getCoreProperties());
+               assertNotNull(xml.getProperties().getExtendedProperties());
                
-               assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
-               assertEquals(0, xml.getExtendedProperties().getCharacters());
-               assertEquals(0, xml.getExtendedProperties().getLines());
+               assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
+               assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
+               assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
                
-               assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
-               assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+               assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
+               assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
        }
 }
index 94127193feffcfc46fee5c310e84adf190c5c73b..0bd24a20721ca029d2bc1fb3b8bf3d894642e1d3 100644 (file)
@@ -92,29 +92,29 @@ public class TestXWPFDocument extends TestCase {
                XWPFDocument xml = new XWPFDocument(
                                POIXMLDocument.openPackage(sampleFile.toString())
                );
-               assertNotNull(xml.getCoreProperties());
-               assertNotNull(xml.getExtendedProperties());
+               assertNotNull(xml.getProperties().getCoreProperties());
+               assertNotNull(xml.getProperties().getExtendedProperties());
                
-               assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
-               assertEquals(1315, xml.getExtendedProperties().getCharacters());
-               assertEquals(10, xml.getExtendedProperties().getLines());
+               assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
+               assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
+               assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
                
-               assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
-               assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+               assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
+               assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
        }
        
        public void testMetadataComplex() throws Exception {
                XWPFDocument xml = new XWPFDocument(
                                POIXMLDocument.openPackage(complexFile.toString())
                );
-               assertNotNull(xml.getCoreProperties());
-               assertNotNull(xml.getExtendedProperties());
+               assertNotNull(xml.getProperties().getCoreProperties());
+               assertNotNull(xml.getProperties().getExtendedProperties());
                
-               assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
-               assertEquals(5184, xml.getExtendedProperties().getCharacters());
-               assertEquals(0, xml.getExtendedProperties().getLines());
+               assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
+               assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
+               assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
                
-               assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
-               assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
+               assertEquals(" ", xml.getProperties().getCoreProperties().getTitle());
+               assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
        }
 }
index 9a1f239bcc4dfa3f31f2bb2c49c5a68fddcf4c5f..e62dd66ce7315166a0b98b202628f45d65fec1df 100644 (file)
@@ -37,6 +37,12 @@ public class TestXWPFWordExtractor extends TestCase {
         */
        private XWPFDocument xmlB;
        private File fileB;
+       
+       /**
+        * File with hyperlinks
+        */
+       private XWPFDocument xmlC;
+       private File fileC;
 
        protected void setUp() throws Exception {
                super.setUp();
@@ -49,11 +55,17 @@ public class TestXWPFWordExtractor extends TestCase {
                                System.getProperty("HWPF.testdata.path") +
                                File.separator + "IllustrativeCases.docx"
                );
+               fileC = new File(
+                               System.getProperty("HWPF.testdata.path") +
+                               File.separator + "TestDocument.docx"
+               );
                assertTrue(fileA.exists());
                assertTrue(fileB.exists());
+               assertTrue(fileC.exists());
                
                xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
                xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
+               xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
        }
 
        /**
@@ -117,4 +129,32 @@ public class TestXWPFWordExtractor extends TestCase {
                }
                assertEquals(79, ps);
        }
+       
+       public void testGetWithHyperlinks() throws Exception {
+               XWPFWordExtractor extractor = 
+                       new XWPFWordExtractor(xmlC);
+               extractor.getText();
+               extractor.setFetchHyperlinks(true);
+               extractor.getText();
+
+               // Now check contents
+               // TODO - fix once correctly handling contents
+               extractor.setFetchHyperlinks(false);
+               assertEquals(
+//                             "This is a test document\nThis bit is in bold and italic\n" +
+//                             "Back to normal\nWe have a hyperlink here, and another.\n",
+                               "This is a test document\nThis bit is in bold and italic\n" +
+                               "Back to normal\nWe have a  here, and .hyperlinkanother\n",
+                               extractor.getText()
+               );
+               
+               extractor.setFetchHyperlinks(true);
+               assertEquals(
+//                             "This is a test document\nThis bit is in bold and italic\n" +
+//                             "Back to normal\nWe have a hyperlink here, and another.\n",
+                               "This is a test document\nThis bit is in bold and italic\n" +
+                               "Back to normal\nWe have a  here, and .hyperlink <http://poi.apache.org/>another\n",
+                               extractor.getText()
+               );
+       }
 }
diff --git a/src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx b/src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx
new file mode 100755 (executable)
index 0000000..058dec5
Binary files /dev/null and b/src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx differ