import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.PackageRelationshipTypes;
import org.openxml4j.opc.PackagingURIHelper;
-import org.openxml4j.opc.internal.PackagePropertiesPart;
-import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
-import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
public abstract class POIXMLDocument {
/** The OPC core Package Part */
private PackagePart corePart;
+ /**
+ * The properties of the OPC package, opened as needed
+ */
+ private POIXMLProperties properties;
+
+
protected POIXMLDocument() {}
protected POIXMLDocument(Package pkg) throws IOException {
}
/**
- * Get the core document properties (core ooxml properties).
- * TODO: Replace with nice usermodel wrapper
- * @deprecated To be replaced with a proper user-model style view of the properties
+ * Get the document properties. This gives you access to the
+ * core ooxml properties, and the extended ooxml properties.
+ * The wrapper is built from the package on the first call and
+ * cached, so later calls return the same instance.
*/
- public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException {
- PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
- if(propsPart == null) {
- return null;
+ public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException {
+ if(properties == null) {
+ properties = new POIXMLProperties(pkg);
}
- return (PackagePropertiesPart)propsPart;
- }
-
- /**
- * Get the extended document properties (extended ooxml properties)
- * TODO: Replace with nice usermodel wrapper
- * @deprecated To be replaced with a proper user-model style view of the properties
- */
- public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
- PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
-
- PropertiesDocument props = PropertiesDocument.Factory.parse(
- propsPart.getInputStream());
- return props.getProperties();
+ return properties;
}
}
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi;
+
+import java.io.IOException;
+
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackageRelationshipCollection;
+import org.openxml4j.opc.internal.PackagePropertiesPart;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
+import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;
+
+/**
+ * Wrapper around the two different kinds of OOXML properties
+ *  a document can have: the core properties and the extended
+ *  properties.
+ */
+public class POIXMLProperties {
+    private Package pkg;
+    private CoreProperties core;
+    private ExtendedProperties ext;
+
+    /**
+     * Opens the core and extended properties of the supplied package.
+     * When the package does not have exactly one extended properties
+     *  relationship, a new, empty set of extended properties is used.
+     *
+     * @param docPackage the package to read the properties from
+     * @throws IllegalArgumentException if the package does not have
+     *  exactly one core properties relationship
+     */
+    public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
+        this.pkg = docPackage;
+
+        // Core properties
+        PackageRelationshipCollection coreRel =
+            pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE);
+        if(coreRel.size() == 1) {
+            core = new CoreProperties( (PackagePropertiesPart)
+                    pkg.getPart(coreRel.getRelationship(0)) );
+        } else {
+            throw new IllegalArgumentException("A document must always have core properties defined!");
+        }
+
+        // Extended properties
+        PackageRelationshipCollection extRel =
+            pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
+        if(extRel.size() == 1) {
+            // Fully qualified so that no extra import is needed.
+            java.io.InputStream extStream =
+                pkg.getPart( extRel.getRelationship(0) ).getInputStream();
+            try {
+                PropertiesDocument props = PropertiesDocument.Factory.parse(extStream);
+                ext = new ExtendedProperties(props);
+            } finally {
+                // Always release the part's stream, even if parsing fails
+                extStream.close();
+            }
+        } else {
+            ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance());
+        }
+    }
+
+    /**
+     * Returns the core document properties
+     */
+    public CoreProperties getCoreProperties() {
+        return core;
+    }
+
+    /**
+     * Returns the extended document properties
+     */
+    public ExtendedProperties getExtendedProperties() {
+        return ext;
+    }
+
+    /**
+     * Writes out the ooxml properties into the supplied,
+     *  new Package. Not yet implemented - currently does nothing.
+     */
+    public void write(Package pkg) {
+        // TODO
+    }
+
+    /**
+     * The core document properties
+     */
+    public class CoreProperties {
+        private PackagePropertiesPart part;
+        private CoreProperties(PackagePropertiesPart part) {
+            this.part = part;
+        }
+
+        /**
+         * Sets the title on the underlying properties part
+         */
+        public void setTitle(String title) {
+            part.setTitleProperty(title);
+        }
+        /**
+         * Returns the value of the title property held by the
+         *  underlying properties part
+         */
+        public String getTitle() {
+            return part.getTitleProperty().getValue();
+        }
+
+        /**
+         * Returns the wrapped properties part, for access to
+         *  properties that have no accessor here
+         */
+        public PackagePropertiesPart getUnderlyingProperties() {
+            return part;
+        }
+    }
+
+    /**
+     * Extended document properties
+     */
+    public class ExtendedProperties {
+        private PropertiesDocument props;
+        private ExtendedProperties(PropertiesDocument props) {
+            this.props = props;
+
+            // Make sure there is always a properties element to hand back
+            if(props.getProperties() == null) {
+                props.addNewProperties();
+            }
+        }
+
+        /**
+         * Returns the properties element of the wrapped document
+         */
+        public CTProperties getUnderlyingProperties() {
+            return props.getProperties();
+        }
+    }
+}
==================================================================== */
package org.apache.poi;
+import java.io.IOException;
+
+import org.apache.poi.POIXMLProperties.*;
+import org.apache.xmlbeans.XmlException;
+import org.openxml4j.exceptions.OpenXML4JException;
+
public abstract class POIXMLTextExtractor extends POITextExtractor {
/** The POIXMLDocument that's open */
protected POIXMLDocument document;
this.document = document;
}
+
+    /**
+     * Gives access to the core properties of the open document
+     */
+    public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException {
+        POIXMLProperties docProperties = document.getProperties();
+        return docProperties.getCoreProperties();
+    }
+    /**
+     * Gives access to the extended properties of the open document
+     */
+    public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
+        POIXMLProperties docProperties = document.getProperties();
+        return docProperties.getExtendedProperties();
+    }
}
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
+import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
+ public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
private DocumentDocument wordDoc;
StylesDocument.Factory.parse(parts[0].getInputStream());
return sd.getStyles();
}
+
+    /**
+     * Fetches the relationships of the hyperlink type from the
+     *  core part. To find where a hyperlink points, look at the
+     *  target of its relationship.
+     */
+    public PackageRelationshipCollection getHyperlinks() {
+        final PackageRelationshipCollection hyperlinkRels;
+        try {
+            hyperlinkRels = getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE);
+        } catch(InvalidFormatException e) {
+            // Should never happen
+            throw new IllegalStateException(e);
+        }
+        return hyperlinkRels;
+    }
}
==================================================================== */
package org.apache.poi.xwpf.extractor;
-import java.io.File;
import java.io.IOException;
import org.apache.poi.POIXMLDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
+import org.openxml4j.opc.PackageRelationship;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
+import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
*/
public class XWPFWordExtractor extends POIXMLTextExtractor {
private XWPFDocument document;
+ private boolean fetchHyperlinks = false;
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
this(new XWPFDocument(container));
));
System.out.println(extractor.getText());
}
+
+    /**
+     * Controls whether hyperlinks are fetched along with the
+     *  text content. By default only the hyperlink label is
+     *  output, not the contents.
+     */
+    public void setFetchHyperlinks(boolean fetch) {
+        this.fetchHyperlinks = fetch;
+    }
public String getText() {
CTBody body = document.getDocumentBody();
// Loop over paragraphs
CTP[] ps = body.getPArray();
for (int i = 0; i < ps.length; i++) {
- // Loop over ranges
+ // Loop over ranges and hyperlinks
+ // TODO - properly intersperse ranges and hyperlinks
CTR[] rs = ps[i].getRArray();
- for (int j = 0; j < rs.length; j++) {
+ for(int j = 0; j < rs.length; j++) {
// Loop over text runs
CTText[] texts = rs[j].getTArray();
for (int k = 0; k < texts.length; k++) {
);
}
}
+
+ CTHyperlink[] hls = ps[i].getHyperlinkArray();
+ for(CTHyperlink hl : hls) {
+ for(CTR r : hl.getRArray()) {
+ for(CTText txt : r.getTArray()) {
+ text.append(txt.getStringValue());
+ }
+ }
+ if(fetchHyperlinks) {
+ String id = hl.getId();
+ if(id != null) {
+ PackageRelationship hlRel =
+ document.getHyperlinks().getRelationshipByID(id);
+ if(hlRel != null) {
+ text.append(" <" + hlRel.getTargetURI().toString() + ">");
+ }
+ }
+ }
+ }
+
// New line after each paragraph.
text.append("\n");
}
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
found = true;
}
- System.out.println(part);
+ //System.out.println(part);
}
assertTrue(found);
}
public void testMetadataBasics() throws Exception {
XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
- assertNotNull(xml.getCoreProperties());
- assertNotNull(xml.getExtendedProperties());
+ assertNotNull(xml.getProperties().getCoreProperties());
+ assertNotNull(xml.getProperties().getExtendedProperties());
- assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
- assertEquals(0, xml.getExtendedProperties().getCharacters());
- assertEquals(0, xml.getExtendedProperties().getLines());
+ assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
+ assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
+ assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
- assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
- assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+ assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
+ assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
}
XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(sampleFile.toString())
);
- assertNotNull(xml.getCoreProperties());
- assertNotNull(xml.getExtendedProperties());
+ assertNotNull(xml.getProperties().getCoreProperties());
+ assertNotNull(xml.getProperties().getExtendedProperties());
- assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
- assertEquals(1315, xml.getExtendedProperties().getCharacters());
- assertEquals(10, xml.getExtendedProperties().getLines());
+ assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
+ assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
+ assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
- assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
- assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
+ assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
+ assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
public void testMetadataComplex() throws Exception {
XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(complexFile.toString())
);
- assertNotNull(xml.getCoreProperties());
- assertNotNull(xml.getExtendedProperties());
+ assertNotNull(xml.getProperties().getCoreProperties());
+ assertNotNull(xml.getProperties().getExtendedProperties());
- assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
- assertEquals(5184, xml.getExtendedProperties().getCharacters());
- assertEquals(0, xml.getExtendedProperties().getLines());
+ assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
+ assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
+ assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
- assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
- assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
+ assertEquals(" ", xml.getProperties().getCoreProperties().getTitle());
+ assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
}
*/
private XWPFDocument xmlB;
private File fileB;
+
+ /**
+ * File with hyperlinks
+ */
+ private XWPFDocument xmlC;
+ private File fileC;
protected void setUp() throws Exception {
super.setUp();
System.getProperty("HWPF.testdata.path") +
File.separator + "IllustrativeCases.docx"
);
+ fileC = new File(
+ System.getProperty("HWPF.testdata.path") +
+ File.separator + "TestDocument.docx"
+ );
assertTrue(fileA.exists());
assertTrue(fileB.exists());
+ assertTrue(fileC.exists());
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
+ xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
}
/**
}
assertEquals(79, ps);
}
+
+    /**
+     * Extracts the text of the hyperlink test file, both with
+     *  and without fetching the hyperlinks
+     */
+    public void testGetWithHyperlinks() throws Exception {
+        XWPFWordExtractor extractor = new XWPFWordExtractor(xmlC);
+
+        // Text extraction should run in both modes
+        extractor.getText();
+        extractor.setFetchHyperlinks(true);
+        extractor.getText();
+
+        // Now check contents
+        // TODO - fix once correctly handling contents. The text should be:
+        //   "This is a test document\nThis bit is in bold and italic\n" +
+        //   "Back to normal\nWe have a hyperlink here, and another.\n"
+        String expectedWithoutLinks =
+            "This is a test document\nThis bit is in bold and italic\n" +
+            "Back to normal\nWe have a here, and .hyperlinkanother\n";
+        String expectedWithLinks =
+            "This is a test document\nThis bit is in bold and italic\n" +
+            "Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n";
+
+        extractor.setFetchHyperlinks(false);
+        assertEquals(expectedWithoutLinks, extractor.getText());
+
+        extractor.setFetchHyperlinks(true);
+        assertEquals(expectedWithLinks, extractor.getText());
+    }
}