From 97fb171369ed0ee344790448071e276de874cee4 Mon Sep 17 00:00:00 2001 From: Nick Burch Date: Wed, 27 Jun 2007 18:34:17 +0000 Subject: [PATCH] Lots more HDGF support for chunks, and add support for basic text extraction git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@551258 13f79535-47bb-0310-9956-ffa450edef68 --- .../src/org/apache/poi/hdgf/chunks/Chunk.java | 21 +++- .../poi/hdgf/chunks/ChunkHeaderV11.java | 4 + .../poi/hdgf/chunks/ChunkSeparator.java | 4 + .../apache/poi/hdgf/chunks/ChunkTrailer.java | 4 + .../org/apache/poi/hdgf/dev/VSDDumper.java | 8 ++ .../hdgf/extractor/VisioTextExtractor.java | 113 ++++++++++++++++++ .../apache/poi/hdgf/streams/ChunkStream.java | 5 + .../org/apache/poi/hdgf/streams/Stream.java | 2 +- .../poi/hdgf/streams/StringsStream.java | 7 +- .../poi/hdgf/streams/TestStreamComplex.java | 60 ++++++++++ 10 files changed, 224 insertions(+), 4 deletions(-) create mode 100644 src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java index 54c37b3e83..673d56edf3 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java @@ -20,6 +20,7 @@ import java.util.ArrayList; import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition; import org.apache.poi.util.LittleEndian; +import org.apache.poi.util.StringUtil; /** * Base of all chunks, which hold data, flags etc @@ -167,9 +168,27 @@ public class Chunk { LittleEndian.getDouble(contents, offset) ); break; + case 12: + // A Little Endian String + // Starts 8 bytes into the data segment + // Ends at end of data, or 00 00 + int startsAt = 8; + int endsAt = startsAt; + for(int j=startsAt; j"; + } } diff --git a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java index a610b49b14..a590732466 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java @@ -26,4 +26,8 @@ public class ChunkTrailer { trailerData = new byte[8]; System.arraycopy(data, offset, trailerData, 0, 8); } + + public String toString() { + return ""; + } } diff --git a/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java b/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java index 3c20e4f3ff..614b9259a0 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java @@ -70,6 +70,11 @@ public class VSDDumper { " - " + Integer.toHexString(ptr.getFormat())); System.out.println(ind + " Length is\t" + ptr.getLength() + " - " + Integer.toHexString(ptr.getLength())); + if(ptr.destinationCompressed()) { + int decompLen = stream._getContentsLength(); + System.out.println(ind + " DC.Length is\t" + decompLen + + " - " + Integer.toHexString(decompLen)); + } System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed()); System.out.println(ind + " Stream is\t" + stream.getClass().getName()); @@ -100,6 +105,9 @@ public class VSDDumper { for(int i=0; i"); + System.exit(1); + } + + VisioTextExtractor extractor = + new VisioTextExtractor(new FileInputStream(args[0])); + System.out.println(extractor.getText()); + } +} diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java index 75b6beefd1..a59fe43ff9 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java @@ -43,6 +43,11 @@ public class ChunkStream extends Stream { public void findChunks() { ArrayList chunksA = new ArrayList(); + if(getPointer().getOffset() == 0x64b3) { + int i = 0; + i++; + } + int pos = 0; byte[] contents = getStore().getContents(); while(pos < contents.length) { diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java index 35aa7e5291..163fa83d9a 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java @@ -83,7 +83,7 @@ public abstract class Stream { return new ChunkStream(pointer, store, chunkFactory); } else if(pointer.destinationHasStrings()) { - return new StringsStream(pointer, store); + return new StringsStream(pointer, store, chunkFactory); } // Give up and return a generic one diff --git a/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java b/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java index 2688b156e9..b23ff92149 100644 --- a/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java +++ b/src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java @@ -16,13 +16,16 @@ limitations under the License. ==================================================================== */ package org.apache.poi.hdgf.streams; +import org.apache.poi.hdgf.chunks.ChunkFactory; import org.apache.poi.hdgf.pointers.Pointer; /** - * A Stream which holds Strings + * A Stream which holds Strings. This is just another kind + * of ChunkStream, it seems */ public class StringsStream extends Stream { - protected StringsStream(Pointer pointer, StreamStore store) { + protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) { super(pointer, store); +// super(pointer, store, chunkFactory); } } diff --git a/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java b/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java index c2d03f0c89..5ea21d1a1c 100644 --- a/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java +++ b/src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java @@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams; import java.io.FileInputStream; +import org.apache.poi.hdgf.chunks.Chunk; import org.apache.poi.hdgf.chunks.ChunkFactory; import org.apache.poi.hdgf.pointers.Pointer; import org.apache.poi.hdgf.pointers.PointerFactory; @@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest { assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream); assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream); } + + public void testChunkWithText() throws Exception { + // Parent ChunkStream is at 0x7194 + // This is one of the last children of the trailer + Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt); + TrailerStream ts = (TrailerStream) + Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory); + + ts.findChildren(contents); + + assertNotNull(ts.getChildPointers()); + assertNotNull(ts.getPointedToStreams()); + assertEquals(20, ts.getChildPointers().length); + assertEquals(20, ts.getPointedToStreams().length); + + assertEquals(0x7194, ts.getChildPointers()[13].getOffset()); + assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset()); + + PointerContainingStream ps7194 = (PointerContainingStream) + ts.getPointedToStreams()[13]; + + // First child is at 0x64b3 + assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset()); + assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset()); + + ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0]; + + // Should be 26bc bytes un-compressed + assertEquals(0x26bc, cs.getStore().getContents().length); + // And should have lots of children + assertEquals(131, cs.getChunks().length); + + // One of which is Text + boolean hasText = false; + for(int i=0; i