Browse Source

Tweak how we do ooxml properties, and handle hyperlinks for word documents when extracting

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_5_BETA2
Nick Burch 16 years ago
parent
commit
b2e48a2767

+ 12
- 24
src/ooxml/java/org/apache/poi/POIXMLDocument.java View File

@@ -32,9 +32,6 @@ import org.openxml4j.opc.PackageRelationship;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.PackageRelationshipTypes;
import org.openxml4j.opc.PackagingURIHelper;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;

public abstract class POIXMLDocument {

@@ -48,6 +45,12 @@ public abstract class POIXMLDocument {
/** The OPC core Package Part */
private PackagePart corePart;
/**
* The properties of the OPC package, opened as needed
*/
private POIXMLProperties properties;
protected POIXMLDocument() {}
protected POIXMLDocument(Package pkg) throws IOException {
@@ -178,28 +181,13 @@ public abstract class POIXMLDocument {
}

/**
* Get the core document properties (core ooxml properties).
* TODO: Replace with nice usermodel wrapper
* @deprecated To be replaced with a proper user-model style view of the properties
* Get the document properties. This gives you access to the
* core ooxml properties, and the extended ooxml properties.
*/
public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException {
PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
if(propsPart == null) {
return null;
public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException {
if(properties == null) {
properties = new POIXMLProperties(pkg);
}
return (PackagePropertiesPart)propsPart;
}
/**
* Get the extended document properties (extended ooxml properties)
* TODO: Replace with nice usermodel wrapper
* @deprecated To be replaced with a proper user-model style view of the properties
*/
public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
PropertiesDocument props = PropertiesDocument.Factory.parse(
propsPart.getInputStream());
return props.getProperties();
return properties;
}
}

+ 124
- 0
src/ooxml/java/org/apache/poi/POIXMLProperties.java View File

@@ -0,0 +1,124 @@
/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;

import java.io.IOException;

import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;

/**
 * Wrapper around the two different kinds of OOXML properties
 * a document can have: the core properties and the extended
 * properties. Both are looked up from the supplied package
 * when the wrapper is constructed.
 */
public class POIXMLProperties {
    private final Package pkg;
    private final CoreProperties core;
    private final ExtendedProperties ext;

    /**
     * Loads the core and extended properties from the given package.
     *
     * @param docPackage the OPC package to read the properties from
     * @throws IllegalArgumentException if the package does not have exactly
     *  one core properties relationship
     * @throws IOException if the extended properties part cannot be read
     * @throws OpenXML4JException if the package relationships or parts
     *  cannot be fetched
     * @throws XmlException if the extended properties part cannot be parsed
     */
    public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
        this.pkg = docPackage;

        // Core properties
        PackageRelationshipCollection coreRel =
            pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE);
        if(coreRel.size() != 1) {
            throw new IllegalArgumentException("A document must always have core properties defined!");
        }
        core = new CoreProperties( (PackagePropertiesPart)
            pkg.getPart(coreRel.getRelationship(0)) );

        // Extended properties
        PackageRelationshipCollection extRel =
            pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
        if(extRel.size() == 1) {
            // Make sure the part's stream is released once parsing is done,
            // whether or not the parse succeeds
            java.io.InputStream in =
                pkg.getPart( extRel.getRelationship(0) ).getInputStream();
            PropertiesDocument props;
            try {
                props = PropertiesDocument.Factory.parse(in);
            } finally {
                in.close();
            }
            ext = new ExtendedProperties(props);
        } else {
            // No single extended properties part found, so start from an empty one
            ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance());
        }
    }

    /**
     * Returns the core document properties
     */
    public CoreProperties getCoreProperties() {
        return core;
    }

    /**
     * Returns the extended document properties
     */
    public ExtendedProperties getExtendedProperties() {
        return ext;
    }

    /**
     * Writes out the ooxml properties into the supplied,
     * new Package. Not yet implemented - currently does nothing.
     */
    public void write(Package pkg) {
        // TODO
    }

    /**
     * The core document properties
     */
    public static class CoreProperties {
        private final PackagePropertiesPart part;

        private CoreProperties(PackagePropertiesPart part) {
            this.part = part;
        }

        /**
         * Sets the title on the underlying core properties part
         */
        public void setTitle(String title) {
            part.setTitleProperty(title);
        }

        /**
         * Returns the value of the title property held by the
         * underlying core properties part
         */
        public String getTitle() {
            return part.getTitleProperty().getValue();
        }

        /**
         * Returns the low level core properties part, for access to
         * properties that have no dedicated accessor here
         */
        public PackagePropertiesPart getUnderlyingProperties() {
            return part;
        }
    }

    /**
     * Extended document properties
     */
    public static class ExtendedProperties {
        private final PropertiesDocument props;

        private ExtendedProperties(PropertiesDocument props) {
            this.props = props;

            // Ensure there is always a properties element to hand back
            if(props.getProperties() == null) {
                props.addNewProperties();
            }
        }

        /**
         * Returns the low level extended properties bean
         */
        public CTProperties getUnderlyingProperties() {
            return props.getProperties();
        }
    }
}

+ 19
- 0
src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java View File

@@ -16,6 +16,12 @@
==================================================================== */
package org.apache.poi;

import java.io.IOException;

import org.apache.poi.POIXMLProperties.*;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;

public abstract class POIXMLTextExtractor extends POITextExtractor {
/** The POIXMLDocument that's open */
protected POIXMLDocument document;
@@ -28,4 +34,17 @@ public abstract class POIXMLTextExtractor extends POITextExtractor {
this.document = document;
}
/**
 * Fetches the core properties of the document this
 * extractor was opened on.
 */
public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException {
    POIXMLProperties allProperties = document.getProperties();
    return allProperties.getCoreProperties();
}
/**
 * Fetches the extended properties of the document this
 * extractor was opened on.
 */
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
    POIXMLProperties allProperties = document.getProperties();
    return allProperties.getExtendedProperties();
}
}

+ 16
- 0
src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java View File

@@ -24,6 +24,7 @@ import org.openxml4j.exceptions.InvalidFormatException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
@@ -47,6 +48,7 @@ public class XWPFDocument extends POIXMLDocument {
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
private DocumentDocument wordDoc;
@@ -89,4 +91,18 @@ public class XWPFDocument extends POIXMLDocument {
StylesDocument.Factory.parse(parts[0].getInputStream());
return sd.getStyles();
}
/**
 * Returns the relationships of type {@link #HYPERLINK_RELATION_TYPE}
 * held by the core part of the document.
 * You'll generally want to ask each relationship for its
 * target, to get the destination of the hyperlink.
 */
public PackageRelationshipCollection getHyperlinks() {
    PackageRelationshipCollection hyperlinks;
    try {
        hyperlinks = getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE);
    } catch(InvalidFormatException e) {
        // Should never happen
        throw new IllegalStateException(e);
    }
    return hyperlinks;
}
}

+ 35
- 3
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java View File

@@ -16,7 +16,6 @@
==================================================================== */
package org.apache.poi.xwpf.extractor;

import java.io.File;
import java.io.IOException;

import org.apache.poi.POIXMLDocument;
@@ -25,7 +24,9 @@ import org.apache.poi.xwpf.XWPFDocument;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationship;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
@@ -35,6 +36,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
*/
public class XWPFWordExtractor extends POIXMLTextExtractor {
private XWPFDocument document;
private boolean fetchHyperlinks = false;
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
this(new XWPFDocument(container));
@@ -56,6 +58,15 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
));
System.out.println(extractor.getText());
}
/**
 * Controls whether hyperlink targets are included
 * when fetching the text content. When this is off
 * (the default), only the hyperlink label text is
 * output, and not the link destination.
 */
public void setFetchHyperlinks(boolean fetch) {
    this.fetchHyperlinks = fetch;
}

public String getText() {
CTBody body = document.getDocumentBody();
@@ -64,9 +75,10 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
// Loop over paragraphs
CTP[] ps = body.getPArray();
for (int i = 0; i < ps.length; i++) {
// Loop over ranges
// Loop over ranges and hyperlinks
// TODO - properly intersperse ranges and hyperlinks
CTR[] rs = ps[i].getRArray();
for (int j = 0; j < rs.length; j++) {
for(int j = 0; j < rs.length; j++) {
// Loop over text runs
CTText[] texts = rs[j].getTArray();
for (int k = 0; k < texts.length; k++) {
@@ -75,6 +87,26 @@ public class XWPFWordExtractor extends POIXMLTextExtractor {
);
}
}
CTHyperlink[] hls = ps[i].getHyperlinkArray();
for(CTHyperlink hl : hls) {
for(CTR r : hl.getRArray()) {
for(CTText txt : r.getTArray()) {
text.append(txt.getStringValue());
}
}
if(fetchHyperlinks) {
String id = hl.getId();
if(id != null) {
PackageRelationship hlRel =
document.getHyperlinks().getRelationshipByID(id);
if(hlRel != null) {
text.append(" <" + hlRel.getTargetURI().toString() + ">");
}
}
}
}
// New line after each paragraph.
text.append("\n");
}

+ 8
- 8
src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java View File

@@ -46,7 +46,7 @@ public class TestXSLFSlideShow extends TestCase {
if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
found = true;
}
System.out.println(part);
//System.out.println(part);
}
assertTrue(found);
}
@@ -110,14 +110,14 @@ public class TestXSLFSlideShow extends TestCase {
public void testMetadataBasics() throws Exception {
XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
assertEquals(0, xml.getExtendedProperties().getCharacters());
assertEquals(0, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
}

+ 14
- 14
src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java View File

@@ -92,29 +92,29 @@ public class TestXWPFDocument extends TestCase {
XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(sampleFile.toString())
);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
assertEquals(1315, xml.getExtendedProperties().getCharacters());
assertEquals(10, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
public void testMetadataComplex() throws Exception {
XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(complexFile.toString())
);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
assertEquals(5184, xml.getExtendedProperties().getCharacters());
assertEquals(0, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(" ", xml.getProperties().getCoreProperties().getTitle());
assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
}
}

+ 40
- 0
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java View File

@@ -37,6 +37,12 @@ public class TestXWPFWordExtractor extends TestCase {
*/
private XWPFDocument xmlB;
private File fileB;
/**
* File with hyperlinks
*/
private XWPFDocument xmlC;
private File fileC;

protected void setUp() throws Exception {
super.setUp();
@@ -49,11 +55,17 @@ public class TestXWPFWordExtractor extends TestCase {
System.getProperty("HWPF.testdata.path") +
File.separator + "IllustrativeCases.docx"
);
fileC = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "TestDocument.docx"
);
assertTrue(fileA.exists());
assertTrue(fileB.exists());
assertTrue(fileC.exists());
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
}

/**
@@ -117,4 +129,32 @@ public class TestXWPFWordExtractor extends TestCase {
}
assertEquals(79, ps);
}
/**
 * Checks text extraction on the file with hyperlinks, both
 * with and without fetching of the hyperlink targets.
 */
public void testGetWithHyperlinks() throws Exception {
    XWPFWordExtractor extractor = new XWPFWordExtractor(xmlC);

    // First just make sure extraction runs in both modes
    extractor.getText();
    extractor.setFetchHyperlinks(true);
    extractor.getText();

    // Now check contents
    // TODO - fix once correctly handling contents. The hyperlink text
    //  is currently output after the rest of the paragraph's text; the
    //  label-only result should eventually read:
    //   "This is a test document\nThis bit is in bold and italic\n" +
    //   "Back to normal\nWe have a hyperlink here, and another.\n"
    String labelsOnly =
        "This is a test document\nThis bit is in bold and italic\n" +
        "Back to normal\nWe have a here, and .hyperlinkanother\n";
    String labelsAndTargets =
        "This is a test document\nThis bit is in bold and italic\n" +
        "Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n";

    extractor.setFetchHyperlinks(false);
    assertEquals(labelsOnly, extractor.getText());

    extractor.setFetchHyperlinks(true);
    assertEquals(labelsAndTargets, extractor.getText());
}
}

BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx View File


Loading…
Cancel
Save