Browse Source

Tweak how we do ooxml properties, and handle hyperlinks for word documents when extracting

git-svn-id: https://svn.apache.org/repos/asf/poi/branches/ooxml@646298 13f79535-47bb-0310-9956-ffa450edef68
tags/REL_3_5_BETA2
Nick Burch 16 years ago
parent
commit
b2e48a2767

+ 12
- 24
src/ooxml/java/org/apache/poi/POIXMLDocument.java View File

import org.openxml4j.opc.PackageRelationshipCollection; import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.PackageRelationshipTypes; import org.openxml4j.opc.PackageRelationshipTypes;
import org.openxml4j.opc.PackagingURIHelper; import org.openxml4j.opc.PackagingURIHelper;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;


public abstract class POIXMLDocument { public abstract class POIXMLDocument {


/** The OPC core Package Part */ /** The OPC core Package Part */
private PackagePart corePart; private PackagePart corePart;
/**
* The properties of the OPC package, opened as needed
*/
private POIXMLProperties properties;
protected POIXMLDocument() {} protected POIXMLDocument() {}
protected POIXMLDocument(Package pkg) throws IOException { protected POIXMLDocument(Package pkg) throws IOException {
} }


/** /**
* Get the core document properties (core ooxml properties).
* TODO: Replace with nice usermodel wrapper
* @deprecated To be replaced with a proper user-model style view of the properties
* Get the document properties. This gives you access to the
* core ooxml properties, and the extended ooxml properties.
*/ */
public PackagePropertiesPart getCoreProperties() throws OpenXML4JException, IOException {
PackagePart propsPart = getSinglePartByRelationType(CORE_PROPERTIES_REL_TYPE);
if(propsPart == null) {
return null;
public POIXMLProperties getProperties() throws OpenXML4JException, IOException, XmlException {
if(properties == null) {
properties = new POIXMLProperties(pkg);
} }
return (PackagePropertiesPart)propsPart;
}
/**
* Get the extended document properties (extended ooxml properties)
* TODO: Replace with nice usermodel wrapper
* @deprecated To be replaced with a proper user-model style view of the properties
*/
public CTProperties getExtendedProperties() throws OpenXML4JException, XmlException, IOException {
PackagePart propsPart = getSinglePartByRelationType(EXTENDED_PROPERTIES_REL_TYPE);
PropertiesDocument props = PropertiesDocument.Factory.parse(
propsPart.getInputStream());
return props.getProperties();
return properties;
} }
} }

+ 124
- 0
src/ooxml/java/org/apache/poi/POIXMLProperties.java View File

/* ====================================================================
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
package org.apache.poi;

import java.io.IOException;

import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxml4j.opc.internal.PackagePropertiesPart;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.CTProperties;
import org.openxmlformats.schemas.officeDocument.x2006.extendedProperties.PropertiesDocument;

/**
* Wrapper around the two different kinds of OOXML properties
* a document can have
*/
public class POIXMLProperties {
    /** The package the properties were loaded from */
    private Package pkg;
    /** Wrapper around the core properties part; always set by the constructor */
    private CoreProperties core;
    /** Wrapper around the extended properties; always set by the constructor */
    private ExtendedProperties ext;

    /**
     * Loads the core and extended properties from the supplied package.
     *
     * The package must have exactly one core properties relationship.
     * If it does not have exactly one extended properties relationship,
     * a new, empty extended properties document is used instead.
     *
     * @param docPackage the package to read the properties from
     * @throws IllegalArgumentException if the package does not have
     *  exactly one core properties relationship
     * @throws IOException if reading or closing the extended properties stream fails
     * @throws OpenXML4JException if the package parts can't be fetched
     * @throws XmlException if the extended properties xml can't be parsed
     */
    public POIXMLProperties(Package docPackage) throws IOException, OpenXML4JException, XmlException {
        this.pkg = docPackage;

        // Core properties
        PackageRelationshipCollection coreRel =
            pkg.getRelationshipsByType(POIXMLDocument.CORE_PROPERTIES_REL_TYPE);
        if(coreRel.size() == 1) {
            core = new CoreProperties( (PackagePropertiesPart)
                    pkg.getPart(coreRel.getRelationship(0)) );
        } else {
            throw new IllegalArgumentException("A document must always have core properties defined!");
        }

        // Extended properties
        PackageRelationshipCollection extRel =
            pkg.getRelationshipsByType(POIXMLDocument.EXTENDED_PROPERTIES_REL_TYPE);
        if(extRel.size() == 1) {
            // Make sure the part's stream is released once parsing
            //  is done, whether or not the parse succeeded
            java.io.InputStream extStream =
                pkg.getPart( extRel.getRelationship(0) ).getInputStream();
            PropertiesDocument props;
            try {
                props = PropertiesDocument.Factory.parse(extStream);
            } finally {
                extStream.close();
            }
            ext = new ExtendedProperties(props);
        } else {
            // Nothing usable in the package, so start from a blank document
            ext = new ExtendedProperties(PropertiesDocument.Factory.newInstance());
        }
    }

    /**
     * Returns the core document properties
     */
    public CoreProperties getCoreProperties() {
        return core;
    }

    /**
     * Returns the extended document properties
     */
    public ExtendedProperties getExtendedProperties() {
        return ext;
    }

    /**
     * Writes out the ooxml properties into the supplied,
     *  new Package.
     * Not yet implemented - currently does nothing.
     *
     * @param pkg the package to write the properties into
     */
    public void write(Package pkg) {
        // TODO
    }

    /**
     * The core document properties
     */
    public class CoreProperties {
        /** The underlying package part holding the core properties */
        private PackagePropertiesPart part;

        private CoreProperties(PackagePropertiesPart part) {
            this.part = part;
        }

        /**
         * Sets the title on the underlying properties part
         */
        public void setTitle(String title) {
            part.setTitleProperty(title);
        }

        /**
         * Returns the value held by the underlying title property
         */
        public String getTitle() {
            return part.getTitleProperty().getValue();
        }

        /**
         * Returns the underlying properties part, for access to
         *  properties that have no wrapper method here
         */
        public PackagePropertiesPart getUnderlyingProperties() {
            return part;
        }
    }

    /**
     * Extended document properties
     */
    public class ExtendedProperties {
        /** The document wrapping the extended properties */
        private PropertiesDocument props;

        private ExtendedProperties(PropertiesDocument props) {
            this.props = props;

            // A freshly created document has no properties element yet,
            //  so add one to ensure getUnderlyingProperties() is non-null
            if(props.getProperties() == null) {
                props.addNewProperties();
            }
        }

        /**
         * Returns the underlying extended properties bean
         */
        public CTProperties getUnderlyingProperties() {
            return props.getProperties();
        }
    }
}

+ 19
- 0
src/ooxml/java/org/apache/poi/POIXMLTextExtractor.java View File

==================================================================== */ ==================================================================== */
package org.apache.poi; package org.apache.poi;


import java.io.IOException;

import org.apache.poi.POIXMLProperties.*;
import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException;

public abstract class POIXMLTextExtractor extends POITextExtractor { public abstract class POIXMLTextExtractor extends POITextExtractor {
/** The POIXMLDocument that's open */ /** The POIXMLDocument that's open */
protected POIXMLDocument document; protected POIXMLDocument document;
this.document = document; this.document = document;
} }
/**
 * Fetches the core properties of the document this
 *  extractor is working on, via the document's properties.
 */
public CoreProperties getCoreProperties() throws IOException, OpenXML4JException, XmlException {
    CoreProperties coreProps = this.document.getProperties().getCoreProperties();
    return coreProps;
}
/**
 * Fetches the extended properties of the document this
 *  extractor is working on, via the document's properties.
 */
public ExtendedProperties getExtendedProperties() throws IOException, OpenXML4JException, XmlException {
    ExtendedProperties extendedProps = this.document.getProperties().getExtendedProperties();
    return extendedProps;
}
} }

+ 16
- 0
src/ooxml/java/org/apache/poi/xwpf/XWPFDocument.java View File

import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package; import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackagePart; import org.openxml4j.opc.PackagePart;
import org.openxml4j.opc.PackageRelationshipCollection;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTStyles;
public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml"; public static final String HEADER_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.header+xml";
public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml"; public static final String STYLES_CONTENT_TYPE = "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml";
public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles"; public static final String STYLES_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles";
public static final String HYPERLINK_RELATION_TYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink";
private DocumentDocument wordDoc; private DocumentDocument wordDoc;
StylesDocument.Factory.parse(parts[0].getInputStream()); StylesDocument.Factory.parse(parts[0].getInputStream());
return sd.getStyles(); return sd.getStyles();
} }
/**
 * Fetches every relationship of the hyperlink type that
 *  hangs off the core part of the document.
 * To find where a hyperlink points, look at the target
 *  of the matching relationship.
 *
 * @throws IllegalStateException wrapping the InvalidFormatException,
 *  should the relationship lookup fail
 */
public PackageRelationshipCollection getHyperlinks() {
    PackageRelationshipCollection hyperlinkRels;
    try {
        hyperlinkRels = getCorePart().getRelationshipsByType(HYPERLINK_RELATION_TYPE);
    } catch(InvalidFormatException invalidFormat) {
        // Should never happen
        throw new IllegalStateException(invalidFormat);
    }
    return hyperlinkRels;
}
} }

+ 35
- 3
src/ooxml/java/org/apache/poi/xwpf/extractor/XWPFWordExtractor.java View File

==================================================================== */ ==================================================================== */
package org.apache.poi.xwpf.extractor; package org.apache.poi.xwpf.extractor;


import java.io.File;
import java.io.IOException; import java.io.IOException;


import org.apache.poi.POIXMLDocument; import org.apache.poi.POIXMLDocument;
import org.apache.xmlbeans.XmlException; import org.apache.xmlbeans.XmlException;
import org.openxml4j.exceptions.OpenXML4JException; import org.openxml4j.exceptions.OpenXML4JException;
import org.openxml4j.opc.Package; import org.openxml4j.opc.Package;
import org.openxml4j.opc.PackageRelationship;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBody;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTHyperlink;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTP;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR;
import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText;
*/ */
public class XWPFWordExtractor extends POIXMLTextExtractor { public class XWPFWordExtractor extends POIXMLTextExtractor {
private XWPFDocument document; private XWPFDocument document;
private boolean fetchHyperlinks = false;
public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException { public XWPFWordExtractor(Package container) throws XmlException, OpenXML4JException, IOException {
this(new XWPFDocument(container)); this(new XWPFDocument(container));
)); ));
System.out.println(extractor.getText()); System.out.println(extractor.getText());
} }
/**
 * Controls whether hyperlink targets are included when
 *  the text content is fetched. When off (the default),
 *  only the label of each hyperlink is output.
 *
 * @param fetch true to also output the hyperlink targets
 */
public void setFetchHyperlinks(boolean fetch) {
    this.fetchHyperlinks = fetch;
}


public String getText() { public String getText() {
CTBody body = document.getDocumentBody(); CTBody body = document.getDocumentBody();
// Loop over paragraphs // Loop over paragraphs
CTP[] ps = body.getPArray(); CTP[] ps = body.getPArray();
for (int i = 0; i < ps.length; i++) { for (int i = 0; i < ps.length; i++) {
// Loop over ranges
// Loop over ranges and hyperlinks
// TODO - properly intersperse ranges and hyperlinks
CTR[] rs = ps[i].getRArray(); CTR[] rs = ps[i].getRArray();
for (int j = 0; j < rs.length; j++) {
for(int j = 0; j < rs.length; j++) {
// Loop over text runs // Loop over text runs
CTText[] texts = rs[j].getTArray(); CTText[] texts = rs[j].getTArray();
for (int k = 0; k < texts.length; k++) { for (int k = 0; k < texts.length; k++) {
); );
} }
} }
CTHyperlink[] hls = ps[i].getHyperlinkArray();
for(CTHyperlink hl : hls) {
for(CTR r : hl.getRArray()) {
for(CTText txt : r.getTArray()) {
text.append(txt.getStringValue());
}
}
if(fetchHyperlinks) {
String id = hl.getId();
if(id != null) {
PackageRelationship hlRel =
document.getHyperlinks().getRelationshipByID(id);
if(hlRel != null) {
text.append(" <" + hlRel.getTargetURI().toString() + ">");
}
}
}
}
// New line after each paragraph. // New line after each paragraph.
text.append("\n"); text.append("\n");
} }

+ 8
- 8
src/ooxml/testcases/org/apache/poi/xslf/TestXSLFSlideShow.java View File

if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) { if(part.getContentType().equals(XSLFSlideShow.MAIN_CONTENT_TYPE)) {
found = true; found = true;
} }
System.out.println(part);
//System.out.println(part);
} }
assertTrue(found); assertTrue(found);
} }
public void testMetadataBasics() throws Exception { public void testMetadataBasics() throws Exception {
XSLFSlideShow xml = new XSLFSlideShow(sampleFile); XSLFSlideShow xml = new XSLFSlideShow(sampleFile);
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office PowerPoint", xml.getExtendedProperties().getApplication());
assertEquals(0, xml.getExtendedProperties().getCharacters());
assertEquals(0, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office PowerPoint", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
} }
} }

+ 14
- 14
src/ooxml/testcases/org/apache/poi/xwpf/TestXWPFDocument.java View File

XWPFDocument xml = new XWPFDocument( XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(sampleFile.toString()) POIXMLDocument.openPackage(sampleFile.toString())
); );
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office Word", xml.getExtendedProperties().getApplication());
assertEquals(1315, xml.getExtendedProperties().getCharacters());
assertEquals(10, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office Word", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(1315, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(10, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(null, xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(null, xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(null, xml.getProperties().getCoreProperties().getTitle());
assertEquals(null, xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
} }
public void testMetadataComplex() throws Exception { public void testMetadataComplex() throws Exception {
XWPFDocument xml = new XWPFDocument( XWPFDocument xml = new XWPFDocument(
POIXMLDocument.openPackage(complexFile.toString()) POIXMLDocument.openPackage(complexFile.toString())
); );
assertNotNull(xml.getCoreProperties());
assertNotNull(xml.getExtendedProperties());
assertNotNull(xml.getProperties().getCoreProperties());
assertNotNull(xml.getProperties().getExtendedProperties());
assertEquals("Microsoft Office Outlook", xml.getExtendedProperties().getApplication());
assertEquals(5184, xml.getExtendedProperties().getCharacters());
assertEquals(0, xml.getExtendedProperties().getLines());
assertEquals("Microsoft Office Outlook", xml.getProperties().getExtendedProperties().getUnderlyingProperties().getApplication());
assertEquals(5184, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getCharacters());
assertEquals(0, xml.getProperties().getExtendedProperties().getUnderlyingProperties().getLines());
assertEquals(" ", xml.getCoreProperties().getTitleProperty().getValue());
assertEquals(" ", xml.getCoreProperties().getSubjectProperty().getValue());
assertEquals(" ", xml.getProperties().getCoreProperties().getTitle());
assertEquals(" ", xml.getProperties().getCoreProperties().getUnderlyingProperties().getSubjectProperty().getValue());
} }
} }

+ 40
- 0
src/ooxml/testcases/org/apache/poi/xwpf/extractor/TestXWPFWordExtractor.java View File

*/ */
private XWPFDocument xmlB; private XWPFDocument xmlB;
private File fileB; private File fileB;
/**
* File with hyperlinks
*/
private XWPFDocument xmlC;
private File fileC;


protected void setUp() throws Exception { protected void setUp() throws Exception {
super.setUp(); super.setUp();
System.getProperty("HWPF.testdata.path") + System.getProperty("HWPF.testdata.path") +
File.separator + "IllustrativeCases.docx" File.separator + "IllustrativeCases.docx"
); );
fileC = new File(
System.getProperty("HWPF.testdata.path") +
File.separator + "TestDocument.docx"
);
assertTrue(fileA.exists()); assertTrue(fileA.exists());
assertTrue(fileB.exists()); assertTrue(fileB.exists());
assertTrue(fileC.exists());
xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString())); xmlA = new XWPFDocument(POIXMLDocument.openPackage(fileA.toString()));
xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString())); xmlB = new XWPFDocument(POIXMLDocument.openPackage(fileB.toString()));
xmlC = new XWPFDocument(POIXMLDocument.openPackage(fileC.toString()));
} }


/** /**
} }
assertEquals(79, ps); assertEquals(79, ps);
} }
/**
 * Extracts text from the hyperlink sample document, both with
 *  and without the hyperlink targets being fetched.
 * The expected strings pin the current output, in which the
 *  hyperlink text is not yet interspersed with the normal runs.
 */
public void testGetWithHyperlinks() throws Exception {
    XWPFWordExtractor hyperlinkExtractor = new XWPFWordExtractor(xmlC);

    // First make sure extraction runs in both modes
    hyperlinkExtractor.getText();
    hyperlinkExtractor.setFetchHyperlinks(true);
    hyperlinkExtractor.getText();

    // Now check contents
    // TODO - fix once correctly handling contents
    // The output we'd like to be checking for, in place of
    //  the last line asserted in each case below, is:
    //   "Back to normal\nWe have a hyperlink here, and another.\n"
    String openingLines =
        "This is a test document\nThis bit is in bold and italic\n";

    // Labels only
    hyperlinkExtractor.setFetchHyperlinks(false);
    String expectedLabelsOnly = openingLines +
        "Back to normal\nWe have a here, and .hyperlinkanother\n";
    assertEquals(expectedLabelsOnly, hyperlinkExtractor.getText());

    // Labels followed by their targets
    hyperlinkExtractor.setFetchHyperlinks(true);
    String expectedWithTargets = openingLines +
        "Back to normal\nWe have a here, and .hyperlink <http://poi.apache.org/>another\n";
    assertEquals(expectedWithTargets, hyperlinkExtractor.getText());
}
} }

BIN
src/scratchpad/testcases/org/apache/poi/hwpf/data/TestDocument.docx View File


Loading…
Cancel
Save