<changes>
<release version="3.8-beta4" date="2011-??-??">
+ <action dev="poi-developers" type="add">Add Word-to-Text converter and use it as replacement for WordExtractor</action>
<action dev="poi-developers" type="fix">51604 - replace text fails for doc ( poi 3.8 beta release from download site )</action>
<action dev="poi-developers" type="fix">Fixed incorrect encoding of non-breaking space (0xA0) in SXSSF</action>
<action dev="poi-developers" type="add">Support for conditional formatting in XSSF</action>
import org.apache.poi.hpsf.DocumentSummaryInformation;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hpsf.extractor.HPSFPropertiesExtractor;
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
/**
public POIOLE2TextExtractor(POIDocument document) {
super(document);
}
-
+
/**
* Returns the document information metadata for the document
*/
public SummaryInformation getSummaryInformation() {
return document.getSummaryInformation();
}
-
+
/**
- * Returns an HPSF powered text extractor for the
+ * Returns an HPSF powered text extractor for the
* document properties metadata, such as title and author.
*/
public POITextExtractor getMetadataTextExtractor() {
return new HPSFPropertiesExtractor(this);
}
- /**
- * Return the underlying POIFS FileSystem of
- * this document.
- */
- public POIFSFileSystem getFileSystem() {
- return document.directory.getFileSystem();
- }
+ public DirectoryEntry getRoot()
+ {
+ return document.directory;
+ }
+
+ /**
+ * Return the underlying POIFS FileSystem of this document.
+ *
+ * @deprecated Use {@link #getRoot()} instead
+ */
+ @Deprecated
+ public POIFSFileSystem getFileSystem()
+ {
+ return document.directory.getFileSystem();
+ }
}
*/
public class EventBasedExcelExtractor extends POIOLE2TextExtractor {
private DirectoryNode _dir;
- private POIFSFileSystem _fs;
boolean _includeSheetNames = true;
boolean _formulasNotResults = false;
- public EventBasedExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) {
- super(null);
- _dir = dir;
- _fs = fs;
- }
+ /**
+ * @deprecated Use {@link #EventBasedExcelExtractor(DirectoryNode)} instead
+ */
+ @Deprecated
+ @SuppressWarnings( "unused" )
+ public EventBasedExcelExtractor( DirectoryNode dir, POIFSFileSystem fs )
+ {
+ this( dir );
+ }
+
+ public EventBasedExcelExtractor( DirectoryNode dir )
+ {
+ super( null );
+ _dir = dir;
+ }
+
public EventBasedExcelExtractor(POIFSFileSystem fs) {
- this(fs.getRoot(), fs);
+ this(fs.getRoot());
}
/**
* this document.
*/
public POIFSFileSystem getFileSystem() {
- return _fs;
+ return _dir.getFileSystem();
}
-
+
/**
* Would return the document information metadata for the document,
* if we supported it
outputNextStringValue = true;
nextRow = frec.getRow();
} else {
- thisText = _ft.formatNumberDateCell(frec);
+ thisText = _ft.formatNumberDateCell(frec);
}
}
break;
case NumberRecord.sid:
NumberRecord numrec = (NumberRecord) record;
thisRow = numrec.getRow();
- thisText = _ft.formatNumberDateCell(numrec);
+ thisText = _ft.formatNumberDateCell(numrec);
break;
default:
break;
import java.io.PrintStream;
import org.apache.poi.POIOLE2TextExtractor;
-import org.apache.poi.ss.formula.eval.ErrorEval;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFCellStyle;
import org.apache.poi.hssf.usermodel.HSSFComment;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.ss.formula.eval.ErrorEval;
import org.apache.poi.ss.usermodel.HeaderFooter;
/**
* A text extractor for Excel files.
* <p>
- * Returns the textual content of the file, suitable for
+ * Returns the textual content of the file, suitable for
* indexing by something like Lucene, but not really
* intended for display to the user.
* </p>
private boolean _includeCellComments = false;
private boolean _includeBlankCells = false;
private boolean _includeHeadersFooters = true;
-
+
public ExcelExtractor(HSSFWorkbook wb) {
super(wb);
_wb = wb;
_formatter = new HSSFDataFormatter();
}
public ExcelExtractor(POIFSFileSystem fs) throws IOException {
- this(fs.getRoot(), fs);
+ this(fs.getRoot());
}
- public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
- this(new HSSFWorkbook(dir, fs, true));
+ /**
+ * @deprecated Use {@link #ExcelExtractor(DirectoryNode)} instead
+ */
+ @Deprecated
+ @SuppressWarnings( "unused" )
+ public ExcelExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
+ this( dir );
+ }
+ public ExcelExtractor(DirectoryNode dir) throws IOException {
+ this(new HSSFWorkbook(dir, true));
}
-
+
private static final class CommandParseException extends Exception {
public CommandParseException(String msg) {
super(msg);
return _headersFooters;
}
}
-
+
private static void printUsageMessage(PrintStream ps) {
ps.println("Use:");
ps.println(" " + ExcelExtractor.class.getName() + " [<flag> <value> [<flag> <value> [...]]] [-i <filename.xls>]");
* Command line extractor.
*/
public static void main(String[] args) {
-
+
CommandArgs cmdArgs;
try {
cmdArgs = new CommandArgs(args);
System.exit(1);
return; // suppress compiler error
}
-
+
if (cmdArgs.isRequestHelp()) {
printUsageMessage(System.out);
return;
}
-
+
try {
InputStream is;
if(cmdArgs.getInputFile() == null) {
* Default is to include them.
*/
public void setIncludeHeadersFooters(boolean includeHeadersFooters) {
- _includeHeadersFooters = includeHeadersFooters;
+ _includeHeadersFooters = includeHeadersFooters;
}
-
+
/**
* Retrieves the text contents of the file
*/
// We don't care about the difference between
// null (missing) and blank cells
_wb.setMissingCellPolicy(HSSFRow.RETURN_BLANK_AS_NULL);
-
+
// Process each sheet in turn
for(int i=0;i<_wb.getNumberOfSheets();i++) {
HSSFSheet sheet = _wb.getSheetAt(i);
if(sheet == null) { continue; }
-
+
if(_includeSheetNames) {
String name = _wb.getSheetName(i);
if(name != null) {
text.append("\n");
}
}
-
+
// Header text, if there is any
if(_includeHeadersFooters) {
text.append(_extractHeaderFooter(sheet.getHeader()));
}
-
+
int firstRow = sheet.getFirstRowNum();
int lastRow = sheet.getLastRowNum();
for(int j=firstRow;j<=lastRow;j++) {
if(_includeBlankCells) {
firstCell = 0;
}
-
+
for(int k=firstCell;k<lastCell;k++) {
HSSFCell cell = row.getCell(k);
boolean outputContents = true;
case HSSFCell.CELL_TYPE_ERROR:
text.append(ErrorEval.getText(cell.getErrorCellValue()));
break;
-
+
}
}
break;
default:
throw new RuntimeException("Unexpected cell type (" + cell.getCellType() + ")");
}
-
+
// Output the comment, if requested and exists
HSSFComment comment = cell.getCellComment();
if(_includeCellComments && comment != null) {
text.append(" Comment by "+comment.getAuthor()+": "+commentText);
}
}
-
+
// Output a tab if we're not on the last cell
if(outputContents && k < (lastCell-1)) {
text.append("\t");
}
}
-
+
// Finish off the row
text.append("\n");
}
-
+
// Finally Footer text, if there is any
if(_includeHeadersFooters) {
text.append(_extractHeaderFooter(sheet.getFooter()));
}
}
-
+
return text.toString();
}
-
+
public static String _extractHeaderFooter(HeaderFooter hf) {
StringBuffer text = new StringBuffer();
-
+
if(hf.getLeft() != null) {
text.append(hf.getLeft());
}
}
if(text.length() > 0)
text.append("\n");
-
+
return text.toString();
}
}
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
-
-package org.apache.poi.poifs.filesystem;
-import java.io.*;
+package org.apache.poi.poifs.filesystem;
-import java.util.*;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
import org.apache.poi.hpsf.ClassID;
public int getEntryCount();
+ /**
+ * Checks if entry with specified name present
+ */
+
+ public boolean hasEntry( final String name );
+
/**
* get a specified Entry by name
*
See the License for the specific language governing permissions and
limitations under the License.
==================================================================== */
-
+
package org.apache.poi.poifs.filesystem;
// the POIFSFileSystem we belong to
private POIFSFileSystem _ofilesystem;
// the NPOIFSFileSytem we belong to
- private NPOIFSFileSystem _nfilesystem;
+ private NPOIFSFileSystem _nfilesystem;
// the path described by this document
private POIFSDocumentPath _path;
{
this(property, parent, filesystem, (NPOIFSFileSystem)null);
}
-
+
/**
* create a DirectoryNode. This method is not public by design; it
* is intended strictly for the internal use of this package
{
this(property, parent, (POIFSFileSystem)null, nfilesystem);
}
-
+
private DirectoryNode(final DirectoryProperty property,
final DirectoryNode parent,
final POIFSFileSystem ofilesystem,
super(property, parent);
this._ofilesystem = ofilesystem;
this._nfilesystem = nfilesystem;
-
+
if (parent == null)
{
_path = new POIFSDocumentPath();
{
return _path;
}
-
+
/**
* @return the filesystem that this belongs to
*/
public POIFSFileSystem getFileSystem()
{
- return _ofilesystem;
+ return _ofilesystem;
}
-
+
/**
* @return the filesystem that this belongs to
*/
public NPOIFSFileSystem getNFileSystem()
{
- return _nfilesystem;
+ return _nfilesystem;
}
-
+
/**
* open a document in the directory's entry's list of entries
*
throw new IOException("Entry '" + document.getName()
+ "' is not a DocumentEntry");
}
-
+
DocumentEntry entry = (DocumentEntry)document;
return new DocumentInputStream(entry);
}
(( DirectoryProperty ) getProperty()).addChild(property);
_ofilesystem.addDocument(document);
-
+
_entries.add(rval);
_byname.put(property.getName(), rval);
return rval;
(( DirectoryProperty ) getProperty()).addChild(property);
_nfilesystem.addDocument(document);
-
+
_entries.add(rval);
_byname.put(property.getName(), rval);
return rval;
{
_entries.remove(entry);
_byname.remove(entry.getName());
-
+
if(_ofilesystem != null) {
_ofilesystem.remove(entry);
} else {
return _entries.size();
}
+ public boolean hasEntry( String name )
+ {
+ return name != null && _byname.containsKey( name );
+ }
+
/**
* get a specified Entry by name
*
{
DirectoryNode rval;
DirectoryProperty property = new DirectoryProperty(name);
-
+
if(_ofilesystem != null) {
rval = new DirectoryNode(property, _ofilesystem, this);
_ofilesystem.addDirectory(property);
* Returns an Iterator over all the entries
*/
public Iterator<Entry> iterator() {
- return getEntries();
+ return getEntries();
}
/* ********** END begin implementation of POIFSViewable ********** */
public class ExtractorFactory {
public static final String CORE_DOCUMENT_REL =
"http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument";
-
-
+
+
/** Should this thread prefer event based over usermodel based extractors? */
private static final ThreadLocal<Boolean> threadPreferEventExtractors = new ThreadLocal<Boolean>() {
protected Boolean initialValue() { return Boolean.FALSE; }
};
/** Should all threads prefer event based over usermodel based extractors? */
private static Boolean allPreferEventExtractors;
-
- /**
+
+ /**
* Should this thread prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is false.
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is false.
*/
public static boolean getThreadPrefersEventExtractors() {
return threadPreferEventExtractors.get();
}
- /**
- * Should all threads prefer event based over usermodel based extractors?
- * (usermodel extractors tend to be more accurate, but use more memory)
- * Default is to use the thread level setting, which defaults to false.
+ /**
+ * Should all threads prefer event based over usermodel based extractors?
+ * (usermodel extractors tend to be more accurate, but use more memory)
+ * Default is to use the thread level setting, which defaults to false.
*/
public static Boolean getAllThreadsPreferEventExtractors() {
return allPreferEventExtractors;
}
-
- /**
+
+ /**
* Should this thread prefer event based over usermodel based extractors?
- * Will only be used if the All Threads setting is null.
+ * Will only be used if the All Threads setting is null.
*/
public static void setThreadPrefersEventExtractors(boolean preferEventExtractors) {
threadPreferEventExtractors.set(preferEventExtractors);
}
- /**
+ /**
* Should all threads prefer event based over usermodel based extractors?
- * If set, will take preference over the Thread level setting.
+ * If set, will take preference over the Thread level setting.
*/
public static void setAllThreadsPreferEventExtractors(Boolean preferEventExtractors) {
allPreferEventExtractors = preferEventExtractors;
}
-
-
+
+
/**
* Should this thread use event based extractors is available?
* Checks the all-threads one first, then thread specific.
}
return threadPreferEventExtractors.get();
}
-
-
+
+
public static POITextExtractor createExtractor(File f) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
InputStream inp = null;
try {
if(inp != null) inp.close();
}
}
-
+
public static POITextExtractor createExtractor(InputStream inp) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Figure out the kind of stream
// If clearly doesn't do mark/reset, wrap up
if(! inp.markSupported()) {
inp = new PushbackInputStream(inp, 8);
}
-
+
if(POIFSFileSystem.hasPOIFSHeader(inp)) {
return createExtractor(new POIFSFileSystem(inp));
}
}
throw new IllegalArgumentException("Your InputStream was neither an OLE2 stream, nor an OOXML stream");
}
-
+
public static POIXMLTextExtractor createExtractor(OPCPackage pkg) throws IOException, OpenXML4JException, XmlException {
- PackageRelationshipCollection core =
+ PackageRelationshipCollection core =
pkg.getRelationshipsByType(CORE_DOCUMENT_REL);
if(core.size() != 1) {
throw new IllegalArgumentException("Invalid OOXML Package received - expected 1 core document, found " + core.size());
}
PackagePart corePart = pkg.getPart(core.getRelationship(0));
-
+
// Is it XSSF?
for(XSSFRelation rel : XSSFExcelExtractor.SUPPORTED_TYPES) {
if(corePart.getContentType().equals(rel.getContentType())) {
}
}
}
-
+
// Is it XWPF?
for(XWPFRelation rel : XWPFWordExtractor.SUPPORTED_TYPES) {
if(corePart.getContentType().equals(rel.getContentType())) {
return new XWPFWordExtractor(pkg);
}
}
-
+
// Is it XSLF?
for(XSLFRelation rel : XSLFPowerPointExtractor.SUPPORTED_TYPES) {
if(corePart.getContentType().equals(rel.getContentType())) {
return new XSLFPowerPointExtractor(pkg);
}
}
-
+
throw new IllegalArgumentException("No supported documents found in the OOXML package (found "+corePart.getContentType()+")");
}
-
+
public static POIOLE2TextExtractor createExtractor(POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
// Only ever an OLE2 one from the root of the FS
- return (POIOLE2TextExtractor)createExtractor(fs.getRoot(), fs);
+ return (POIOLE2TextExtractor)createExtractor(fs.getRoot());
}
- public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
- // Look for certain entries in the stream, to figure it
- // out from
- for(Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext(); ) {
- Entry entry = entries.next();
-
- if(entry.getName().equals("Workbook")) {
- if(getPreferEventExtractor()) {
- return new EventBasedExcelExtractor(poifsDir, fs);
- } else {
- return new ExcelExtractor(poifsDir, fs);
- }
- }
- if(entry.getName().equals("WordDocument")) {
- // Old or new style word document?
- try {
- return new WordExtractor(poifsDir, fs);
- } catch(OldWordFileFormatException e) {
- return new Word6Extractor(poifsDir, fs);
- }
- }
- if(entry.getName().equals("PowerPoint Document")) {
- return new PowerPointExtractor(poifsDir, fs);
- }
- if(entry.getName().equals("VisioDocument")) {
- return new VisioTextExtractor(poifsDir, fs);
- }
- if(entry.getName().equals("Quill")) {
- return new PublisherTextExtractor(poifsDir, fs);
- }
- if(
- entry.getName().equals("__substg1.0_1000001E") ||
- entry.getName().equals("__substg1.0_1000001F") ||
- entry.getName().equals("__substg1.0_0047001E") ||
- entry.getName().equals("__substg1.0_0047001F") ||
- entry.getName().equals("__substg1.0_0037001E") ||
- entry.getName().equals("__substg1.0_0037001F")
- ) {
- return new OutlookTextExtactor(poifsDir, fs);
- }
- if(entry.getName().equals("Package")) {
- OPCPackage pkg = OPCPackage.open(
- poifsDir.createDocumentInputStream(entry.getName())
- );
- return createExtractor(pkg);
- }
- }
- throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
- }
-
-
+
+ /**
+ * @deprecated Use {@link #createExtractor(DirectoryNode)} instead
+ */
+ @Deprecated
+ @SuppressWarnings("unused")
+ public static POITextExtractor createExtractor(DirectoryNode poifsDir, POIFSFileSystem fs)
+ throws IOException, InvalidFormatException, OpenXML4JException, XmlException
+ {
+ return createExtractor(poifsDir);
+ }
+
+ public static POITextExtractor createExtractor(DirectoryNode poifsDir) throws IOException,
+ InvalidFormatException, OpenXML4JException, XmlException
+ {
+ // Look for certain entries in the stream, to figure it
+ // out from
+ if (poifsDir.hasEntry("Workbook")) {
+ if (getPreferEventExtractor()) {
+ return new EventBasedExcelExtractor(poifsDir);
+ }
+ return new ExcelExtractor(poifsDir);
+ }
+
+ if (poifsDir.hasEntry("WordDocument")) {
+ // Old or new style word document?
+ try {
+ return new WordExtractor(poifsDir);
+ } catch (OldWordFileFormatException e) {
+ return new Word6Extractor(poifsDir);
+ }
+ }
+
+ if (poifsDir.hasEntry("PowerPoint Document")) {
+ return new PowerPointExtractor(poifsDir);
+ }
+
+ if (poifsDir.hasEntry("VisioDocument")) {
+ return new VisioTextExtractor(poifsDir);
+ }
+
+ if (poifsDir.hasEntry("Quill")) {
+ return new PublisherTextExtractor(poifsDir);
+ }
+
+ if (poifsDir.hasEntry("__substg1.0_1000001E") || poifsDir.hasEntry("__substg1.0_1000001F")
+ || poifsDir.hasEntry("__substg1.0_0047001E")
+ || poifsDir.hasEntry("__substg1.0_0047001F")
+ || poifsDir.hasEntry("__substg1.0_0037001E")
+ || poifsDir.hasEntry("__substg1.0_0037001F"))
+ {
+ return new OutlookTextExtactor(poifsDir);
+ }
+
+ for (Iterator<Entry> entries = poifsDir.getEntries(); entries.hasNext();) {
+ Entry entry = entries.next();
+
+ if (entry.getName().equals("Package")) {
+ OPCPackage pkg = OPCPackage.open(poifsDir.createDocumentInputStream("Package"));
+ return createExtractor(pkg);
+ }
+ }
+ throw new IllegalArgumentException("No supported documents found in the OLE2 stream");
+ }
+
/**
* Returns an array of text extractors, one for each of
* the embeded documents in the file (if there are any).
* If there are no embeded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
+ * empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embeded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException, InvalidFormatException, OpenXML4JException, XmlException {
ArrayList<Entry> dirs = new ArrayList<Entry>();
// For anything else not directly held in as a POIFS directory
ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
-
+
// Find all the embeded directories
- POIFSFileSystem fs = ext.getFileSystem();
- if(fs == null) {
+ DirectoryEntry root = ext.getRoot();
+ if(root == null) {
throw new IllegalStateException("The extractor didn't know which POIFS it came from!");
}
-
+
if(ext instanceof ExcelExtractor) {
// These are in MBD... under the root
- Iterator<Entry> it = fs.getRoot().getEntries();
+ Iterator<Entry> it = root.getEntries();
while(it.hasNext()) {
Entry entry = it.next();
if(entry.getName().startsWith("MBD")) {
// These are in ObjectPool -> _... under the root
try {
DirectoryEntry op = (DirectoryEntry)
- fs.getRoot().getEntry("ObjectPool");
+ root.getEntry("ObjectPool");
Iterator<Entry> it = op.getEntries();
while(it.hasNext()) {
Entry entry = it.next();
}
}
}
-
+
// Create the extractors
if(
(dirs == null || dirs.size() == 0) &&
){
return new POITextExtractor[0];
}
-
+
ArrayList<POITextExtractor> e = new ArrayList<POITextExtractor>();
for(int i=0; i<dirs.size(); i++) {
e.add( createExtractor(
- (DirectoryNode)dirs.get(i), ext.getFileSystem()
+ (DirectoryNode)dirs.get(i)
) );
}
for(int i=0; i<nonPOIFS.size(); i++) {
* Returns an array of text extractors, one for each of
* the embeded documents in the file (if there are any).
* If there are no embeded documents, you'll get back an
- * empty array. Otherwise, you'll get one open
+ * empty array. Otherwise, you'll get one open
* {@link POITextExtractor} for each embeded file.
*/
public static POITextExtractor[] getEmbededDocsTextExtractors(POIXMLTextExtractor ext) {
import java.io.InputStream;
import java.io.OutputStream;
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
import org.apache.poi.hwpf.model.BookmarksTables;
import org.apache.poi.hwpf.model.CHPBinTable;
import org.apache.poi.hwpf.model.CPSplitCalculator;
* @param pfilesystem The POIFSFileSystem that contains the Word document.
* @throws IOException If there is an unexpected IOException from the passed
* in POIFSFileSystem.
+ * @deprecated Use {@link #HWPFDocument(DirectoryNode)} instead
*/
+ @Deprecated
public HWPFDocument(DirectoryNode directory, POIFSFileSystem pfilesystem) throws IOException
{
this(directory);
package org.apache.poi.hwpf;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PushbackInputStream;
+import org.apache.poi.hwpf.usermodel.ObjectsPool;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+
+import org.apache.poi.hwpf.usermodel.ObjectPoolImpl;
+
import org.apache.poi.EncryptedDocumentException;
import org.apache.poi.POIDocument;
import org.apache.poi.hwpf.model.CHPBinTable;
*/
public abstract class HWPFDocumentCore extends POIDocument
{
+ /** Holds OLE2 objects */
+ protected ObjectPoolImpl _objectPool;
+
/** The FIB */
protected FileInformationBlock _fib;
if(_fib.isFEncrypted()) {
throw new EncryptedDocumentException("Cannot process encrypted word files!");
}
- }
+
+ {
+ DirectoryEntry objectPoolEntry;
+ try
+ {
+ objectPoolEntry = (DirectoryEntry) directory
+ .getEntry( "ObjectPool" );
+ }
+ catch ( FileNotFoundException exc )
+ {
+ objectPoolEntry = directory.createDirectory( "ObjectPool" );
+ }
+ _objectPool = new ObjectPoolImpl( objectPoolEntry );
+ }
+ }
/**
* Returns the range which covers the whole of the document, but excludes
return _fib;
}
+ public ObjectsPool getObjectsPool()
+ {
+ return _objectPool;
+ }
+
public abstract TextPieceTable getTextTable();
}
this(fs.getRoot());
}
+ @Deprecated
public HWPFOldDocument(DirectoryNode directory, POIFSFileSystem fs)
throws IOException {
this(directory);
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.util.Beta;
import org.apache.poi.util.POILogFactory;
import org.apache.poi.util.POILogger;
@Beta
public abstract class AbstractWordConverter
{
+ private static final class Structure implements Comparable<Structure>
+ {
+ final int end;
+ final int start;
+ final Object structure;
+
+ Structure( Bookmark bookmark )
+ {
+ this.start = bookmark.getStart();
+ this.end = bookmark.getEnd();
+ this.structure = bookmark;
+ }
+
+ Structure( Field field )
+ {
+ this.start = field.getFieldStartOffset();
+ this.end = field.getFieldEndOffset();
+ this.structure = field;
+ }
+
+ public int compareTo( Structure o )
+ {
+ return start < o.start ? -1 : start == o.start ? 0 : 1;
+ }
+ }
+
private static final byte BEL_MARK = 7;
private static final byte FIELD_BEGIN_MARK = 19;
processDrawnObject( doc, characterRun, block );
continue;
}
+ if ( characterRun.isOle2()
+ && ( wordDocument instanceof HWPFDocument ) )
+ {
+ HWPFDocument doc = (HWPFDocument) wordDocument;
+ processOle2( doc, characterRun, block );
+ continue;
+ }
}
if ( text.getBytes()[0] == FIELD_BEGIN_MARK )
CharacterRun characterRun, OfficeDrawing officeDrawing,
String path, Element block );
- protected abstract void processEndnoteAutonumbered( HWPFDocument wordDocument,
- int noteIndex, Element block, Range endnoteTextRange );
+ protected abstract void processEndnoteAutonumbered(
+ HWPFDocument wordDocument, int noteIndex, Element block,
+ Range endnoteTextRange );
- protected void processField( HWPFDocument hwpfDocument, Range parentRange,
+ protected void processField( HWPFDocument wordDocument, Range parentRange,
int currentTableLevel, Field field, Element currentBlock )
{
switch ( field.getType() )
if ( matcher.find() )
{
String pageref = matcher.group( 1 );
- processPageref( hwpfDocument, currentBlock,
+ processPageref( wordDocument, currentBlock,
field.secondSubrange( parentRange ),
currentTableLevel, pageref );
return;
}
break;
}
+ case 58: // Embedded Object
+ {
+ if ( !field.hasSeparator() )
+ {
+ logger.log( POILogger.WARN, parentRange + " contains " + field
+ + " with 'Embedded Object' but without separator mark" );
+ return;
+ }
+
+ CharacterRun separator = field
+ .getMarkSeparatorCharacterRun( parentRange );
+
+ if ( separator.isOle2() )
+ {
+ // the only supported so far
+ boolean processed = processOle2( wordDocument, separator,
+ currentBlock );
+
+ // if we didn't output OLE - output field value
+ if ( !processed )
+ {
+ processCharacters( wordDocument, currentTableLevel,
+ field.secondSubrange( parentRange ), currentBlock );
+ }
+
+ return;
+ }
+
+ break;
+ }
case 88: // hyperlink
{
final Range firstSubrange = field.firstSubrange( parentRange );
if ( matcher.find() )
{
String hyperlink = matcher.group( 1 );
- processHyperlink( hwpfDocument, currentBlock,
+ processHyperlink( wordDocument, currentBlock,
field.secondSubrange( parentRange ),
currentTableLevel, hyperlink );
return;
logger.log( POILogger.WARN, parentRange + " contains " + field
+ " with unsupported type or format" );
- processCharacters( hwpfDocument, currentTableLevel,
+ processCharacters( wordDocument, currentTableLevel,
field.secondSubrange( parentRange ), currentBlock );
}
- protected abstract void processFootnoteAutonumbered( HWPFDocument wordDocument,
- int noteIndex, Element block, Range footnoteTextRange );
+ protected abstract void processFootnoteAutonumbered(
+ HWPFDocument wordDocument, int noteIndex, Element block,
+ Range footnoteTextRange );
protected abstract void processHyperlink( HWPFDocumentCore wordDocument,
Element currentBlock, Range textRange, int currentTableLevel,
}
}
+ private boolean processOle2( HWPFDocument doc, CharacterRun characterRun,
+ Element block )
+ {
+ Entry entry = doc.getObjectsPool().getObjectById(
+ "_" + characterRun.getPicOffset() );
+ if ( entry == null )
+ {
+ logger.log( POILogger.WARN, "Referenced OLE2 object '",
+ Integer.valueOf( characterRun.getPicOffset() ),
+ "' not found in ObjectPool" );
+ return false;
+ }
+
+ try
+ {
+ return processOle2( doc, block, entry );
+ }
+ catch ( Exception exc )
+ {
+ logger.log( POILogger.WARN,
+ "Unable to convert internal OLE2 object '",
+ Integer.valueOf( characterRun.getPicOffset() ), "': ", exc,
+ exc );
+ return false;
+ }
+ }
+
+ @SuppressWarnings( "unused" )
+ protected boolean processOle2( HWPFDocument wordDocument, Element block,
+ Entry entry ) throws Exception
+ {
+ return false;
+ }
+
protected abstract void processPageref( HWPFDocumentCore wordDocument,
Element currentBlock, Range textRange, int currentTableLevel,
String pageref );
return endMark;
}
- private static final class Structure implements Comparable<Structure>
- {
- final int end;
- final int start;
- final Object structure;
-
- Structure( Bookmark bookmark )
- {
- this.start = bookmark.getStart();
- this.end = bookmark.getEnd();
- this.structure = bookmark;
- }
-
- Structure( Field field )
- {
- this.start = field.getFieldStartOffset();
- this.end = field.getFieldEndOffset();
- this.structure = field;
- }
-
- public int compareTo( Structure o )
- {
- return start < o.start ? -1 : start == o.start ? 0 : 1;
- }
- }
-
}
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.util.Beta;
import org.apache.poi.util.IOUtils;
return !isEmpty( str );
}
+ public static HWPFDocumentCore loadDoc( final DirectoryNode root )
+ throws IOException
+ {
+ try
+ {
+ return new HWPFDocument( root );
+ }
+ catch ( OldWordFileFormatException exc )
+ {
+ return new HWPFOldDocument( root );
+ }
+ }
+
public static HWPFDocumentCore loadDoc( File docFile ) throws IOException
{
final FileInputStream istream = new FileInputStream( docFile );
public static HWPFDocumentCore loadDoc( InputStream inputStream )
throws IOException
{
- final POIFSFileSystem poifsFileSystem = HWPFDocumentCore
- .verifyAndBuildPOIFS( inputStream );
- try
- {
- return new HWPFDocument( poifsFileSystem );
- }
- catch ( OldWordFileFormatException exc )
- {
- return new HWPFOldDocument( poifsFileSystem );
- }
+ return loadDoc( HWPFDocumentCore.verifyAndBuildPOIFS( inputStream ) );
+ }
+
+ public static HWPFDocumentCore loadDoc(
+ final POIFSFileSystem poifsFileSystem ) throws IOException
+ {
+ return loadDoc( poifsFileSystem.getRoot() );
}
static String substringBeforeLast( String str, String separator )
}
@Override
- protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
- Element block, Range endnoteTextRange )
+ protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+ int noteIndex, Element block, Range endnoteTextRange )
{
final String textIndex = String.valueOf( internalLinkCounter
.incrementAndGet() );
setId( backwardLink, forwardLinkName );
endnote.appendChild( backwardLink );
- processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange, endnote );
+ processCharacters( wordDocument, Integer.MIN_VALUE, endnoteTextRange,
+ endnote );
WordToFoUtils.compactInlines( endnote );
this.endnotes.add( endnote );
@Beta
public class WordToHtmlConverter extends AbstractWordConverter
{
-
/**
* Holds properties values, applied to current <tt>p</tt> element. Those
* properties shall not be doubled in children <tt>span</tt> elements.
}
@Override
- protected void processEndnoteAutonumbered( HWPFDocument wordDocument, int noteIndex,
- Element block, Range endnoteTextRange )
+ protected void processEndnoteAutonumbered( HWPFDocument wordDocument,
+ int noteIndex, Element block, Range endnoteTextRange )
{
- processNoteAutonumbered( wordDocument, "end", noteIndex, block, endnoteTextRange );
+ processNoteAutonumbered( wordDocument, "end", noteIndex, block,
+ endnoteTextRange );
}
@Override
import java.io.File;
import java.io.FileWriter;
+import java.io.StringWriter;
+import java.lang.reflect.Method;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
+import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import org.apache.poi.hwpf.usermodel.Table;
import org.apache.poi.hwpf.usermodel.TableCell;
import org.apache.poi.hwpf.usermodel.TableRow;
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.util.Beta;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
public class WordToTextConverter extends AbstractWordConverter
{
+ public static String getText( DirectoryNode root ) throws Exception
+ {
+ final HWPFDocumentCore wordDocument = AbstractWordUtils.loadDoc( root );
+ return getText( wordDocument );
+ }
+
+ public static String getText( File docFile ) throws Exception
+ {
+ final HWPFDocumentCore wordDocument = AbstractWordUtils
+ .loadDoc( docFile );
+ return getText( wordDocument );
+ }
+
+ public static String getText( final HWPFDocumentCore wordDocument )
+ throws Exception
+ {
+ WordToTextConverter wordToTextConverter = new WordToTextConverter(
+ DocumentBuilderFactory.newInstance().newDocumentBuilder()
+ .newDocument() );
+ wordToTextConverter.processDocument( wordDocument );
+ return wordToTextConverter.getText();
+ }
+
/**
* Java main() interface to interact with {@link WordToTextConverter}
*
private Element notes = null;
+ private boolean outputSummaryInformation = false;
+
private final TextDocumentFacade textDocumentFacade;
+ /**
+ * Creates new instance of {@link WordToTextConverter}. Can be used for
+ * output several {@link HWPFDocument}s into single text document.
+ *
+ * @throws ParserConfigurationException
+ * if an internal {@link DocumentBuilder} cannot be created
+ */
+ public WordToTextConverter() throws ParserConfigurationException
+ {
+ this.textDocumentFacade = new TextDocumentFacade(
+ DocumentBuilderFactory.newInstance().newDocumentBuilder()
+ .newDocument() );
+ }
+
/**
* Creates new instance of {@link WordToTextConverter}. Can be used for
* output several {@link HWPFDocument}s into single text document.
return textDocumentFacade.getDocument();
}
+ public String getText() throws Exception
+ {
+ StringWriter stringWriter = new StringWriter();
+ DOMSource domSource = new DOMSource( getDocument() );
+ StreamResult streamResult = new StreamResult( stringWriter );
+
+ TransformerFactory tf = TransformerFactory.newInstance();
+ Transformer serializer = tf.newTransformer();
+ // TODO set encoding from a command argument
+ serializer.setOutputProperty( OutputKeys.ENCODING, "UTF-8" );
+ serializer.setOutputProperty( OutputKeys.INDENT, "no" );
+ serializer.setOutputProperty( OutputKeys.METHOD, "text" );
+ serializer.transform( domSource, streamResult );
+
+ return stringWriter.toString();
+ }
+
+ public boolean isOutputSummaryInformation()
+ {
+ return outputSummaryInformation;
+ }
+
@Override
protected void outputCharacters( Element block, CharacterRun characterRun,
String text )
protected void processDocumentInformation(
SummaryInformation summaryInformation )
{
- if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
- textDocumentFacade.setTitle( summaryInformation.getTitle() );
+ if ( isOutputSummaryInformation() )
+ {
+ if ( AbstractWordUtils.isNotEmpty( summaryInformation.getTitle() ) )
+ textDocumentFacade.setTitle( summaryInformation.getTitle() );
- if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
- textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
+ if ( AbstractWordUtils.isNotEmpty( summaryInformation.getAuthor() ) )
+ textDocumentFacade.addAuthor( summaryInformation.getAuthor() );
- if ( AbstractWordUtils.isNotEmpty( summaryInformation.getComments() ) )
- textDocumentFacade
- .addDescription( summaryInformation.getComments() );
+ if ( AbstractWordUtils
+ .isNotEmpty( summaryInformation.getComments() ) )
+ textDocumentFacade.addDescription( summaryInformation
+ .getComments() );
- if ( AbstractWordUtils.isNotEmpty( summaryInformation.getKeywords() ) )
- textDocumentFacade.addKeywords( summaryInformation.getKeywords() );
+ if ( AbstractWordUtils
+ .isNotEmpty( summaryInformation.getKeywords() ) )
+ textDocumentFacade.addKeywords( summaryInformation
+ .getKeywords() );
+ }
}
@Override
note.appendChild( textDocumentFacade.createText( "\n" ) );
}
+ @Override
+ protected boolean processOle2( HWPFDocument wordDocument, Element block,
+ Entry entry ) throws Exception
+ {
+ if ( !( entry instanceof DirectoryNode ) )
+ return false;
+ DirectoryNode directoryNode = (DirectoryNode) entry;
+
+ // even if no ExtractorFactory in classpath
+ if ( directoryNode.hasEntry( "WordDocument" ) )
+ {
+ String text = WordToTextConverter.getText( (DirectoryNode) entry );
+ block.appendChild( textDocumentFacade
+ .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+ + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+ return true;
+ }
+
+ try
+ {
+ Class<?> cls = Class
+ .forName( "org.apache.poi.extractor.ExtractorFactory" );
+ Method createExtractor = cls.getMethod( "createExtractor",
+ DirectoryNode.class );
+ Object extractor = createExtractor.invoke( null, directoryNode );
+
+ Method getText = extractor.getClass().getMethod( "getText" );
+ String text = (String) getText.invoke( extractor );
+
+ block.appendChild( textDocumentFacade
+ .createText( UNICODECHAR_ZERO_WIDTH_SPACE + text
+ + UNICODECHAR_ZERO_WIDTH_SPACE ) );
+ return true;
+ }
+ catch ( ClassNotFoundException exc )
+ {
+ // no extractor in classpath
+ }
+
+ return false;
+ }
+
@Override
protected void processPageref( HWPFDocumentCore wordDocument,
Element currentBlock, Range textRange, int currentTableLevel,
textDocumentFacade.body.appendChild( sectionElement );
}
- protected void processTable( HWPFDocumentCore hwpfDocument, Element flow,
+ protected void processTable( HWPFDocumentCore wordDocument, Element flow,
Table table )
{
final int tableRows = table.numRows();
tableCellElement.appendChild( textDocumentFacade
.createText( "\t" ) );
- processParagraphes( hwpfDocument, tableCellElement, tableCell,
- table.getTableLevel() );
+ processCharacters( wordDocument, table.getTableLevel(),
+ tableCell, tableCellElement );
tableRowElement.appendChild( tableCellElement );
}
}
}
+ public void setOutputSummaryInformation( boolean outputDocumentInformation )
+ {
+ this.outputSummaryInformation = outputDocumentInformation;
+ }
+
}
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringWriter;
+
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+import org.apache.poi.hwpf.usermodel.HeaderStories;
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFOldDocument;
this( new POIFSFileSystem(is) );
}
- /**
- * Create a new Word Extractor
- * @param fs POIFSFileSystem containing the word file
- */
- public Word6Extractor(POIFSFileSystem fs) throws IOException {
- this(fs.getRoot(), fs);
- }
- public Word6Extractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
- this(new HWPFOldDocument(dir,fs));
- }
+ /**
+ * Create a new Word Extractor
+ *
+ * @param fs
+ * POIFSFileSystem containing the word file
+ */
+ public Word6Extractor( POIFSFileSystem fs ) throws IOException
+ {
+ this( fs.getRoot() );
+ }
+
+ /**
+ * @deprecated Use {@link #Word6Extractor(DirectoryNode)} instead
+ */
+ @Deprecated
+ @SuppressWarnings( "unused" )
+ public Word6Extractor( DirectoryNode dir, POIFSFileSystem fs )
+ throws IOException
+ {
+ this( dir );
+ }
+
+ public Word6Extractor( DirectoryNode dir ) throws IOException
+ {
+ this( new HWPFOldDocument( dir ) );
+ }
/**
* Create a new Word Extractor
* Get the text from the word file, as an array with one String
* per paragraph
*/
+ @Deprecated
public String[] getParagraphText() {
String[] ret;
return ret;
}
- public String getText() {
- StringBuffer text = new StringBuffer();
-
- for(String t : getParagraphText()) {
- text.append(t);
+ public String getText()
+ {
+ try
+ {
+ WordToTextConverter wordToTextConverter = new WordToTextConverter();
+ wordToTextConverter.processDocument( doc );
+ return wordToTextConverter.getText();
}
+ catch ( Exception exc )
+ {
+ // fall-back
+ StringBuffer text = new StringBuffer();
- return text.toString();
+ for ( String t : getParagraphText() )
+ {
+ text.append( t );
+ }
+
+ return text.toString();
+ }
}
}
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Arrays;
+import org.apache.poi.hwpf.converter.WordToTextConverter;
+
import org.apache.poi.POIOLE2TextExtractor;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.HeaderStories;
/**
* Class to extract the text from a Word Document.
- *
- * You should use either getParagraphText() or getText() unless
- * you have a strong reason otherwise.
- *
+ *
+ * You should use either getParagraphText() or getText() unless you have a
+ * strong reason otherwise.
+ *
* @author Nick Burch
*/
-public final class WordExtractor extends POIOLE2TextExtractor {
- private POIFSFileSystem fs;
- private HWPFDocument doc;
-
- /**
- * Create a new Word Extractor
- * @param is InputStream containing the word file
- */
- public WordExtractor(InputStream is) throws IOException {
- this( HWPFDocument.verifyAndBuildPOIFS(is) );
- }
-
- /**
- * Create a new Word Extractor
- * @param fs POIFSFileSystem containing the word file
- */
- public WordExtractor(POIFSFileSystem fs) throws IOException {
- this(new HWPFDocument(fs));
- this.fs = fs;
- }
- public WordExtractor(DirectoryNode dir, POIFSFileSystem fs) throws IOException {
- this(new HWPFDocument(dir, fs));
- this.fs = fs;
- }
-
- /**
- * Create a new Word Extractor
- * @param doc The HWPFDocument to extract from
- */
- public WordExtractor(HWPFDocument doc) {
- super(doc);
- this.doc = doc;
- }
-
- /**
- * Command line extractor, so people will stop moaning that
- * they can't just run this.
- */
- public static void main(String[] args) throws IOException {
- if(args.length == 0) {
- System.err.println("Use:");
- System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
- System.exit(1);
- }
-
- // Process the first argument as a file
- FileInputStream fin = new FileInputStream(args[0]);
- WordExtractor extractor = new WordExtractor(fin);
- System.out.println(extractor.getText());
- }
-
- /**
- * Get the text from the word file, as an array with one String
- * per paragraph
- */
- public String[] getParagraphText() {
- String[] ret;
-
- // Extract using the model code
- try {
- Range r = doc.getRange();
-
- ret = getParagraphText(r);
- } catch (Exception e) {
- // Something's up with turning the text pieces into paragraphs
- // Fall back to ripping out the text pieces
- ret = new String[1];
- ret[0] = getTextFromPieces();
- }
+public final class WordExtractor extends POIOLE2TextExtractor
+{
+ private HWPFDocument doc;
+
+ /**
+ * Create a new Word Extractor
+ *
+ * @param is
+ * InputStream containing the word file
+ */
+ public WordExtractor( InputStream is ) throws IOException
+ {
+ this( HWPFDocument.verifyAndBuildPOIFS( is ) );
+ }
+
+ /**
+ * Create a new Word Extractor
+ *
+ * @param fs
+ * POIFSFileSystem containing the word file
+ */
+ public WordExtractor( POIFSFileSystem fs ) throws IOException
+ {
+ this( new HWPFDocument( fs ) );
+ }
+
+ /**
+ * @deprecated Use {@link #WordExtractor(DirectoryNode)} instead
+ */
+ @Deprecated
+ public WordExtractor( DirectoryNode dir, POIFSFileSystem fs )
+ throws IOException
+ {
+ this( dir );
+ }
+
+ public WordExtractor( DirectoryNode dir ) throws IOException
+ {
+ this( new HWPFDocument( dir ) );
+ }
+
+ /**
+ * Create a new Word Extractor
+ *
+ * @param doc
+ * The HWPFDocument to extract from
+ */
+ public WordExtractor( HWPFDocument doc )
+ {
+ super( doc );
+ this.doc = doc;
+ }
+
+ /**
+ * Command line extractor, so people will stop moaning that they can't just
+ * run this.
+ */
+ public static void main( String[] args ) throws IOException
+ {
+ if ( args.length == 0 )
+ {
+ System.err.println( "Use:" );
+ System.err
+ .println( " java org.apache.poi.hwpf.extractor.WordExtractor <filename>" );
+ System.exit( 1 );
+ }
- return ret;
+ // Process the first argument as a file
+ FileInputStream fin = new FileInputStream( args[0] );
+ WordExtractor extractor = new WordExtractor( fin );
+ System.out.println( extractor.getText() );
+ }
+
+ /**
+ * Get the text from the word file, as an array with one String per
+ * paragraph
+ */
+ public String[] getParagraphText()
+ {
+ String[] ret;
+
+ // Extract using the model code
+ try
+ {
+ Range r = doc.getRange();
+
+ ret = getParagraphText( r );
+ }
+ catch ( Exception e )
+ {
+ // Something's up with turning the text pieces into paragraphs
+ // Fall back to ripping out the text pieces
+ ret = new String[1];
+ ret[0] = getTextFromPieces();
}
- public String[] getFootnoteText() {
- Range r = doc.getFootnoteRange();
+ return ret;
+ }
- return getParagraphText(r);
- }
+ public String[] getFootnoteText()
+ {
+ Range r = doc.getFootnoteRange();
- public String[] getMainTextboxText() {
- Range r = doc.getMainTextboxRange();
+ return getParagraphText( r );
+ }
- return getParagraphText(r);
- }
+ public String[] getMainTextboxText()
+ {
+ Range r = doc.getMainTextboxRange();
+
+ return getParagraphText( r );
+ }
+
+ public String[] getEndnoteText()
+ {
+ Range r = doc.getEndnoteRange();
+
+ return getParagraphText( r );
+ }
+
+ public String[] getCommentsText()
+ {
+ Range r = doc.getCommentsRange();
+
+ return getParagraphText( r );
+ }
- public String[] getEndnoteText() {
- Range r = doc.getEndnoteRange();
+ protected static String[] getParagraphText( Range r )
+ {
+ String[] ret;
+ ret = new String[r.numParagraphs()];
+ for ( int i = 0; i < ret.length; i++ )
+ {
+ Paragraph p = r.getParagraph( i );
+ ret[i] = p.text();
- return getParagraphText(r);
+ // Fix the line ending
+ if ( ret[i].endsWith( "\r" ) )
+ {
+ ret[i] = ret[i] + "\n";
+ }
+ }
+ return ret;
+ }
+
+ /**
+ * Add the header/footer text, if it's not empty
+ */
+ private void appendHeaderFooter( String text, StringBuffer out )
+ {
+ if ( text == null || text.length() == 0 )
+ return;
+
+ text = text.replace( '\r', '\n' );
+ if ( !text.endsWith( "\n" ) )
+ {
+ out.append( text );
+ out.append( '\n' );
+ return;
+ }
+ if ( text.endsWith( "\n\n" ) )
+ {
+ out.append( text.substring( 0, text.length() - 1 ) );
+ return;
+ }
+ out.append( text );
+ return;
+ }
+
+ /**
+ * Grab the text from the headers
+ */
+ @Deprecated
+ public String getHeaderText()
+ {
+ HeaderStories hs = new HeaderStories( doc );
+
+ StringBuffer ret = new StringBuffer();
+ if ( hs.getFirstHeader() != null )
+ {
+ appendHeaderFooter( hs.getFirstHeader(), ret );
+ }
+ if ( hs.getEvenHeader() != null )
+ {
+ appendHeaderFooter( hs.getEvenHeader(), ret );
+ }
+ if ( hs.getOddHeader() != null )
+ {
+ appendHeaderFooter( hs.getOddHeader(), ret );
}
- public String[] getCommentsText() {
- Range r = doc.getCommentsRange();
+ return ret.toString();
+ }
+
+ /**
+ * Grab the text from the footers
+ */
+ @Deprecated
+ public String getFooterText()
+ {
+ HeaderStories hs = new HeaderStories( doc );
+
+ StringBuffer ret = new StringBuffer();
+ if ( hs.getFirstFooter() != null )
+ {
+ appendHeaderFooter( hs.getFirstFooter(), ret );
+ }
+ if ( hs.getEvenFooter() != null )
+ {
+ appendHeaderFooter( hs.getEvenFooter(), ret );
+ }
+ if ( hs.getOddFooter() != null )
+ {
+ appendHeaderFooter( hs.getOddFooter(), ret );
+ }
- return getParagraphText(r);
+ return ret.toString();
+ }
+
+ /**
+ * Grab the text out of the text pieces. Might also include various bits of
+ * crud, but will work in cases where the text piece -> paragraph mapping is
+ * broken. Fast too.
+ */
+ public String getTextFromPieces()
+ {
+ String text = doc.getDocumentText();
+
+ // Fix line endings (Note - won't get all of them
+ text = text.replaceAll( "\r\r\r", "\r\n\r\n\r\n" );
+ text = text.replaceAll( "\r\r", "\r\n\r\n" );
+
+ if ( text.endsWith( "\r" ) )
+ {
+ text += "\n";
}
- protected static String[] getParagraphText(Range r) {
- String[] ret;
- ret = new String[r.numParagraphs()];
- for (int i = 0; i < ret.length; i++) {
- Paragraph p = r.getParagraph(i);
- ret[i] = p.text();
-
- // Fix the line ending
- if (ret[i].endsWith("\r")) {
- ret[i] = ret[i] + "\n";
- }
+ return text;
+ }
+
+ /**
+ * Grab the text, based on the WordToTextConverter. Shouldn't include any
+ * crud, but slower than getTextFromPieces().
+ */
+ public String getText()
+ {
+ try
+ {
+ final StringWriter stringWriter = new StringWriter();
+ @SuppressWarnings( "unused" )
+ WordToTextConverter wordToTextConverter = new WordToTextConverter()
+ {
+ {
+ HeaderStories hs = new HeaderStories( doc );
+
+ if ( hs.getFirstHeaderSubrange() != null )
+ processDocumentPart( doc, hs.getFirstHeaderSubrange() );
+ if ( hs.getEvenHeaderSubrange() != null )
+ processDocumentPart( doc, hs.getEvenHeaderSubrange() );
+ if ( hs.getOddHeaderSubrange() != null )
+ processDocumentPart( doc, hs.getOddHeaderSubrange() );
+
+ processDocument( doc );
+ processDocumentPart( doc, doc.getMainTextboxRange() );
+
+ if ( hs.getFirstFooterSubrange() != null )
+ processDocumentPart( doc, hs.getFirstFooterSubrange() );
+ if ( hs.getEvenFooterSubrange() != null )
+ processDocumentPart( doc, hs.getEvenFooterSubrange() );
+ if ( hs.getOddFooterSubrange() != null )
+ processDocumentPart( doc, hs.getOddFooterSubrange() );
+
+ stringWriter.append( getText() );
}
- return ret;
+ };
+ return stringWriter.toString();
}
-
- /**
- * Add the header/footer text, if it's not empty
- */
- private void appendHeaderFooter(String text, StringBuffer out) {
- if(text == null || text.length() == 0)
- return;
-
- text = text.replace('\r', '\n');
- if(! text.endsWith("\n")) {
- out.append(text);
- out.append('\n');
- return;
- }
- if(text.endsWith("\n\n")) {
- out.append(text.substring(0, text.length()-1));
- return;
- }
- out.append(text);
- return;
- }
- /**
- * Grab the text from the headers
- */
- public String getHeaderText() {
- HeaderStories hs = new HeaderStories(doc);
-
- StringBuffer ret = new StringBuffer();
- if(hs.getFirstHeader() != null) {
- appendHeaderFooter(hs.getFirstHeader(), ret);
- }
- if(hs.getEvenHeader() != null) {
- appendHeaderFooter(hs.getEvenHeader(), ret);
- }
- if(hs.getOddHeader() != null) {
- appendHeaderFooter(hs.getOddHeader(), ret);
- }
-
- return ret.toString();
- }
- /**
- * Grab the text from the footers
- */
- public String getFooterText() {
- HeaderStories hs = new HeaderStories(doc);
-
- StringBuffer ret = new StringBuffer();
- if(hs.getFirstFooter() != null) {
- appendHeaderFooter(hs.getFirstFooter(), ret);
- }
- if(hs.getEvenFooter() != null) {
- appendHeaderFooter(hs.getEvenFooter(), ret);
- }
- if(hs.getOddFooter() != null) {
- appendHeaderFooter(hs.getOddFooter(), ret);
- }
-
- return ret.toString();
- }
-
- /**
- * Grab the text out of the text pieces. Might also include various
- * bits of crud, but will work in cases where the text piece -> paragraph
- * mapping is broken. Fast too.
- */
- public String getTextFromPieces() {
- String text = doc.getDocumentText();
-
- // Fix line endings (Note - won't get all of them
- text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
- text = text.replaceAll("\r\r", "\r\n\r\n");
-
- if(text.endsWith("\r")) {
- text += "\n";
- }
-
- return text;
- }
-
- /**
- * Grab the text, based on the paragraphs. Shouldn't include any crud,
- * but slightly slower than getTextFromPieces().
- */
- public String getText() {
- StringBuffer ret = new StringBuffer();
-
- ret.append(getHeaderText());
-
- ArrayList<String> text = new ArrayList<String>();
- text.addAll(Arrays.asList(getParagraphText()));
- text.addAll(Arrays.asList(getMainTextboxText()));
- text.addAll(Arrays.asList(getFootnoteText()));
- text.addAll(Arrays.asList(getEndnoteText()));
-
- for(String p : text) {
- ret.append(p);
- }
-
- ret.append(getFooterText());
-
- return ret.toString();
- }
-
- /**
- * Removes any fields (eg macros, page markers etc)
- * from the string.
- */
- public static String stripFields(String text) {
- return Range.stripFields(text);
- }
+ catch ( Exception exc )
+ {
+ throw new RuntimeException( exc );
+ }
+ }
+
+ /**
+ * Removes any fields (eg macros, page markers etc) from the string.
+ */
+ public static String stripFields( String text )
+ {
+ return Range.stripFields( text );
+ }
}
*/
int getFieldStartOffset();
+ CharacterRun getMarkEndCharacterRun( Range parent );
+
/**
* @return character position of end field mark
*/
int getMarkEndOffset();
+ CharacterRun getMarkSeparatorCharacterRun( Range parent );
+
/**
* @return character position of separator field mark (if present,
* {@link NullPointerException} otherwise)
*/
int getMarkSeparatorOffset();
+ CharacterRun getMarkStartCharacterRun( Range parent );
+
/**
* @return character position of start field mark
*/
return startPlex.getFcStart();
}
+ public CharacterRun getMarkEndCharacterRun( Range parent )
+ {
+ return new Range( getMarkEndOffset(), getMarkEndOffset() + 1, parent )
+ .getCharacterRun( 0 );
+ }
+
/**
* @return character position of end field mark
*/
return endPlex.getFcStart();
}
+ public CharacterRun getMarkSeparatorCharacterRun( Range parent )
+ {
+ if ( !hasSeparator() )
+ return null;
+
+ return new Range( getMarkSeparatorOffset(),
+ getMarkSeparatorOffset() + 1, parent ).getCharacterRun( 0 );
+ }
+
/**
* @return character position of separator field mark (if present,
* {@link NullPointerException} otherwise)
return separatorPlex.getFcStart();
}
+ public CharacterRun getMarkStartCharacterRun( Range parent )
+ {
+ return new Range( getMarkStartOffset(), getMarkStartOffset() + 1,
+ parent ).getCharacterRun( 0 );
+ }
+
/**
* @return character position of start field mark
*/
fib.getPlcfHddSize(), 0 );
}
- public String getFootnoteSeparator() {
- return getAt(0);
- }
- public String getFootnoteContSeparator() {
- return getAt(1);
- }
- public String getFootnoteContNote() {
- return getAt(2);
- }
- public String getEndnoteSeparator() {
- return getAt(3);
- }
- public String getEndnoteContSeparator() {
- return getAt(4);
- }
- public String getEndnoteContNote() {
- return getAt(5);
- }
+ @Deprecated
+ public String getFootnoteSeparator()
+ {
+ return getAt( 0 );
+ }
+
+ @Deprecated
+ public String getFootnoteContSeparator()
+ {
+ return getAt( 1 );
+ }
+
+ @Deprecated
+ public String getFootnoteContNote()
+ {
+ return getAt( 2 );
+ }
+
+ @Deprecated
+ public String getEndnoteSeparator()
+ {
+ return getAt( 3 );
+ }
+ @Deprecated
+ public String getEndnoteContSeparator()
+ {
+ return getAt( 4 );
+ }
+
+ @Deprecated
+ public String getEndnoteContNote()
+ {
+ return getAt( 5 );
+ }
+
+ public Range getFootnoteSeparatorSubrange()
+ {
+ return getSubrangeAt( 0 );
+ }
+
+ public Range getFootnoteContSeparatorSubrange()
+ {
+ return getSubrangeAt( 1 );
+ }
+
+ public Range getFootnoteContNoteSubrange()
+ {
+ return getSubrangeAt( 2 );
+ }
+
+ public Range getEndnoteSeparatorSubrange()
+ {
+ return getSubrangeAt( 3 );
+ }
+
+ public Range getEndnoteContSeparatorSubrange()
+ {
+ return getSubrangeAt( 4 );
+ }
+ public Range getEndnoteContNoteSubrange()
+ {
+ return getSubrangeAt( 5 );
+ }
+
+ @Deprecated
public String getEvenHeader() {
return getAt(6+0);
}
+ @Deprecated
public String getOddHeader() {
return getAt(6+1);
}
+ @Deprecated
public String getFirstHeader() {
return getAt(6+4);
}
+
+
+ public Range getEvenHeaderSubrange() {
+ return getSubrangeAt(6+0);
+ }
+ public Range getOddHeaderSubrange() {
+ return getSubrangeAt(6+1);
+ }
+ public Range getFirstHeaderSubrange() {
+ return getSubrangeAt(6+4);
+ }
+
/**
* Returns the correct, defined header for the given
* one based page
return getOddHeader();
}
+ @Deprecated
+ public String getEvenFooter()
+ {
+ return getAt( 6 + 2 );
+ }
+
+ @Deprecated
+ public String getOddFooter()
+ {
+ return getAt( 6 + 3 );
+ }
+
+ @Deprecated
+ public String getFirstFooter()
+ {
+ return getAt( 6 + 5 );
+ }
+
+ public Range getEvenFooterSubrange()
+ {
+ return getSubrangeAt( 6 + 2 );
+ }
+
+ public Range getOddFooterSubrange()
+ {
+ return getSubrangeAt( 6 + 3 );
+ }
+
+ public Range getFirstFooterSubrange()
+ {
+ return getSubrangeAt( 6 + 5 );
+ }
- public String getEvenFooter() {
- return getAt(6+2);
- }
- public String getOddFooter() {
- return getAt(6+3);
- }
- public String getFirstFooter() {
- return getAt(6+5);
- }
/**
* Returns the correct, defined footer for the given
* one based page
* Get the string that's pointed to by the
* given plcfHdd index
*/
+ @Deprecated
private String getAt(int plcfHddIndex) {
if(plcfHdd == null) return null;
return text;
}
+ private Range getSubrangeAt( int plcfHddIndex )
+ {
+ if ( plcfHdd == null )
+ return null;
+
+ GenericPropertyNode prop = plcfHdd.getProperty( plcfHddIndex );
+ if ( prop.getStart() == prop.getEnd() )
+ {
+ // Empty story
+ return null;
+ }
+ if ( prop.getEnd() < prop.getStart() )
+ {
+ // Broken properties?
+ return null;
+ }
+
+ final int headersLength = headerStories.getEndOffset()
+ - headerStories.getStartOffset();
+ int start = Math.min( prop.getStart(), headersLength );
+ int end = Math.min( prop.getEnd(), headersLength );
+
+ return new Range( headerStories.getStartOffset() + start,
+ headerStories.getStartOffset() + end, headerStories );
+ }
+
public Range getRange() {
return headerStories;
}
--- /dev/null
+package org.apache.poi.hwpf.usermodel;
+
+import java.io.FileNotFoundException;
+
+import org.apache.poi.poifs.filesystem.DirectoryEntry;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.util.Internal;
+
+@Internal
+public class ObjectPoolImpl implements ObjectsPool
+{
+ private DirectoryEntry _objectPool;
+
+ public ObjectPoolImpl( DirectoryEntry _objectPool )
+ {
+ super();
+ this._objectPool = _objectPool;
+ }
+
+ public Entry getObjectById( String objId )
+ {
+ if ( _objectPool == null )
+ return null;
+
+ try
+ {
+ return _objectPool.getEntry( objId );
+ }
+ catch ( FileNotFoundException exc )
+ {
+ return null;
+ }
+ }
+}
--- /dev/null
+package org.apache.poi.hwpf.usermodel;
+
+import org.apache.poi.poifs.filesystem.Entry;
+
+public interface ObjectsPool
+{
+ public Entry getObjectById( String objId );
+}
import org.apache.poi.hwpf.sprm.ParagraphSprmCompressor;
import org.apache.poi.hwpf.sprm.SprmBuffer;
import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.POILogFactory;
+import org.apache.poi.util.POILogger;
/**
* This class is the central class of the HWPF object model. All properties that
*/
public class Range { // TODO -instantiable superclass
+ private POILogger logger = POILogFactory.getLogger( Range.class );
+
public static final int TYPE_PARAGRAPH = 0;
public static final int TYPE_CHARACTER = 1;
public static final int TYPE_SECTION = 2;
initAll();
if ( tableEndInclusive >= this._parEnd )
{
- throw new ArrayIndexOutOfBoundsException(
- "The table's bounds fall outside of this Range" );
+ logger.log( POILogger.WARN, "The table's bounds ", "["
+ + this._parStart + "; " + tableEndInclusive + ")",
+ " fall outside of this Range paragraphs numbers ", "["
+ + this._parStart + "; " + this._parEnd + ")" );
}
+
if ( tableEndInclusive < 0 )
{
throw new ArrayIndexOutOfBoundsException(
--- /dev/null
+package org.apache.poi.hwpf.converter;
+
+import junit.framework.TestCase;
+import org.apache.poi.hwpf.HWPFDocument;
+import org.apache.poi.hwpf.HWPFTestDataSamples;
+
+public class TestWordToTextConverter extends TestCase
+{
+
+ /**
+ * [FAILING] Bug 47731 - Word Extractor considers text copied from some
+ * website as an embedded object
+ */
+ public void testBug47731() throws Exception
+ {
+ HWPFDocument doc = HWPFTestDataSamples.openSampleFile( "Bug47731.doc" );
+ String foundText = WordToTextConverter.getText( doc );
+
+ assertTrue( foundText
+ .contains( "Soak the rice in water for three to four hours" ) );
+ }
+}
* @author Nick Burch (nick at torchbox dot com)
*/
public final class TestWordExtractor extends TestCase {
+
+ public static void assertEquals( String expected, String actual )
+ {
+ String newExpected = expected.replaceAll( "\r\n", "\n" )
+ .replaceAll( "\r", "\n" ).trim();
+ String newActual = actual.replaceAll( "\r\n", "\n" )
+ .replaceAll( "\r", "\n" ).trim();
+ TestCase.assertEquals( newExpected, newActual );
+ }
+
private String[] p_text1 = new String[] {
"This is a simple word document\r\n",
"\r\n",
public void testGetText() {
assertEquals(p_text1_block, extractor.getText());
- // For the 2nd, should give similar answers for
- // the two methods, differing only in line endings
- assertEquals(
- extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
- extractor2.getText().replaceAll("[\\r\\n]", ""));
- }
+ // For the 2nd, should give similar answers for
+ // the two methods, differing only in line endings
+
+ // nope, they must have different results, because of garbage
+ // assertEquals(
+ // extractor2.getTextFromPieces().replaceAll("[\\r\\n]", ""),
+ // extractor2.getText().replaceAll("[\\r\\n]", ""));
+ }
/**
* Test textPieces based extraction
// Open directly
for(DirectoryNode dir : files) {
- WordExtractor extractor = new WordExtractor(dir, null);
+ WordExtractor extractor = new WordExtractor(dir);
assertEquals(p_text1_block, extractor.getText());
}
public class TestBugs extends TestCase
{
+ public static void assertEquals( String expected, String actual )
+ {
+ String newExpected = expected.replaceAll( "\r\n", "\n" )
+ .replaceAll( "\r", "\n" ).trim();
+ String newActual = actual.replaceAll( "\r\n", "\n" )
+ .replaceAll( "\r", "\n" ).trim();
+ TestCase.assertEquals( newExpected, newActual );
+ }
+
private static void assertTableStructures( Range expected, Range actual )
{
assertEquals( expected.numParagraphs(), actual.numParagraphs() );