]> source.dussan.org Git - poi.git/commitdiff
Lots more HDGF support for chunks, and add support for basic text extraction
authorNick Burch <nick@apache.org>
Wed, 27 Jun 2007 18:34:17 +0000 (18:34 +0000)
committerNick Burch <nick@apache.org>
Wed, 27 Jun 2007 18:34:17 +0000 (18:34 +0000)
git-svn-id: https://svn.apache.org/repos/asf/poi/trunk@551258 13f79535-47bb-0310-9956-ffa450edef68

src/scratchpad/src/org/apache/poi/hdgf/chunks/Chunk.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkHeaderV11.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkSeparator.java
src/scratchpad/src/org/apache/poi/hdgf/chunks/ChunkTrailer.java
src/scratchpad/src/org/apache/poi/hdgf/dev/VSDDumper.java
src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java [new file with mode: 0644]
src/scratchpad/src/org/apache/poi/hdgf/streams/ChunkStream.java
src/scratchpad/src/org/apache/poi/hdgf/streams/Stream.java
src/scratchpad/src/org/apache/poi/hdgf/streams/StringsStream.java
src/scratchpad/testcases/org/apache/poi/hdgf/streams/TestStreamComplex.java

index 54c37b3e8304a22ffbdc80418dcca4b5a34de66a..673d56edf3372c723db4a8b35587f6e85aca1c7e 100644 (file)
@@ -20,6 +20,7 @@ import java.util.ArrayList;
 
 import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
 import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
 
 /**
  * Base of all chunks, which hold data, flags etc
@@ -167,9 +168,27 @@ public class Chunk {
                                                LittleEndian.getDouble(contents, offset)
                                );
                                break;
+                       case 12:
+                               // A Little Endian String
+                               // Starts 8 bytes into the data segment
+                               // Ends at end of data, or 00 00
+                               int startsAt = 8;
+                               int endsAt = startsAt;
+                               for(int j=startsAt; j<contents.length-1 && endsAt == startsAt; j++) {
+                                       if(contents[j] == 0 && contents[j+1] == 0) {
+                                               endsAt = j;
+                                       }
+                               }
+                               if(endsAt == startsAt) {
+                                       endsAt = contents.length;
+                               }
+                               
+                               int strLen = (endsAt-startsAt) / 2;
+                               command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
+                               break;
                        case 25:
                                command.value = new Short(
-                                               LittleEndian.getShort(contents, offset)
+                                       LittleEndian.getShort(contents, offset)
                                );
                                break;
                        case 26:
index 51eca5649c7be7429fe95b27015067a8b6cf563a..c77a249204bbbaa2d4c6445e3220a98440dee64f 100644 (file)
@@ -24,6 +24,10 @@ public class ChunkHeaderV11 extends ChunkHeaderV6 {
         * Does the chunk have a separator?
         */
        public boolean hasSeparator() {
+               // For some reason, there are two types that don't have a 
+               //  separator despite the flags that indicate they do
+               if(type == 0x1f || type == 0xc9) { return false; }
+               
                // If there's a trailer, there's a separator
                if(hasTrailer()) { return true; }
 
index 7098f17ea68f12e5210d199f6f53f5a1412916d1..5ce4097446915130e20ad6c57dd07b75f43ff4f5 100644 (file)
@@ -27,4 +27,8 @@ public class ChunkSeparator {
                separatorData = new byte[4];
                System.arraycopy(data, offset, separatorData, 0, 4);
        }
+       
+       /**
+        * Gives a short description of this separator, reporting
+        *  how many bytes of separator data it holds.
+        */
+       public String toString() {
+               int length = separatorData.length;
+               return "<ChunkSeparator of length " + length + ">";
+       }
 }
index a610b49b14a82a50400db9b880a309df1c93d4b7..a590732466b9b58ddbd2a354ff0acf5263f27c85 100644 (file)
@@ -26,4 +26,8 @@ public class ChunkTrailer {
                trailerData = new byte[8];
                System.arraycopy(data, offset, trailerData, 0, 8);
        }
+       
+       /**
+        * Gives a short description of this trailer, reporting
+        *  how many bytes of trailer data it holds.
+        */
+       public String toString() {
+               int length = trailerData.length;
+               return "<ChunkTrailer of length " + length + ">";
+       }
 }
index 3c20e4f3ffa64e5f44346dca348a0c7219aca632..614b9259a0cdada28c326f5b2f8fb2cd9d9e2295 100644 (file)
@@ -70,6 +70,11 @@ public class VSDDumper {
                                " - " + Integer.toHexString(ptr.getFormat()));
                System.out.println(ind + "  Length is\t" + ptr.getLength() +
                                " - " + Integer.toHexString(ptr.getLength()));
+               if(ptr.destinationCompressed()) {
+                       int decompLen = stream._getContentsLength();
+                       System.out.println(ind + "  DC.Length is\t" + decompLen +
+                                       " - " + Integer.toHexString(decompLen));
+               }
                System.out.println(ind + "  Compressed is\t" + ptr.destinationCompressed());
                System.out.println(ind + "  Stream is\t" + stream.getClass().getName());
                
@@ -100,6 +105,9 @@ public class VSDDumper {
                        for(int i=0; i<cs.getChunks().length; i++) {
                                Chunk chunk = cs.getChunks()[i];
                                System.out.println(ind2 + "" + chunk.getName());
+                               System.out.println(ind2 + "  Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
+                               System.out.println(ind2 + "  OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
+                               System.out.println(ind2 + "  T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
                                System.out.println(ind2 + "  Holds " + chunk.getCommands().length + " commands");
                                for(int j=0; j<chunk.getCommands().length; j++) {
                                        Command command = chunk.getCommands()[j];
diff --git a/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java b/src/scratchpad/src/org/apache/poi/hdgf/extractor/VisioTextExtractor.java
new file mode 100644 (file)
index 0000000..f7ef6df
--- /dev/null
@@ -0,0 +1,113 @@
+/* ====================================================================
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+==================================================================== */
+package org.apache.poi.hdgf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import org.apache.poi.hdgf.HDGFDiagram;
+import org.apache.poi.hdgf.chunks.Chunk.Command;
+import org.apache.poi.hdgf.streams.ChunkStream;
+import org.apache.poi.hdgf.streams.PointerContainingStream;
+import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to find all the text in a Visio file, and return it.
+ * Can operate on the command line (outputs to stdout), or
+ *  can return the text for you (eg for use with Lucene).
+ */
+public class VisioTextExtractor {
+       private HDGFDiagram hdgf;
+       private POIFSFileSystem fs;
+
+       /**
+        * Creates an extractor which works on an already opened diagram.
+        */
+       public VisioTextExtractor(HDGFDiagram hdgf) {
+               this.hdgf = hdgf;
+       }
+       /**
+        * Creates an extractor for the diagram held in the given
+        *  POIFS filesystem.
+        * @throws IOException if the diagram can't be opened from it
+        */
+       public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
+               this(new HDGFDiagram(fs));
+               this.fs = fs;
+       }
+       /**
+        * Creates an extractor for the diagram read from the given
+        *  stream. The stream is not closed by this class.
+        * @throws IOException if the stream can't be read or opened
+        */
+       public VisioTextExtractor(InputStream inp) throws IOException {
+               this(new POIFSFileSystem(inp));
+       }
+
+       /**
+        * Locates all the text entries in the file, and returns their
+        *  contents.
+        * @return one entry per piece of text found, in the order the
+        *  streams were walked; empty if no text was found
+        */
+       public String[] getAllText() {
+               ArrayList text = new ArrayList();
+               Stream[] topLevel = hdgf.getTopLevelStreams();
+               for(int i=0; i<topLevel.length; i++) {
+                       findText(topLevel[i], text);
+               }
+               return (String[])text.toArray( new String[text.size()] );
+       }
+       /**
+        * Recursively walks the given stream and everything it points
+        *  to, adding the value of the first command of every chunk
+        *  named "Text" to the supplied list.
+        */
+       private void findText(Stream stream, ArrayList text) {
+               if(stream instanceof PointerContainingStream) {
+                       PointerContainingStream ps = (PointerContainingStream)stream;
+                       Stream[] children = ps.getPointedToStreams();
+                       for(int i=0; i<children.length; i++) {
+                               findText(children[i], text);
+                       }
+               }
+               if(stream instanceof ChunkStream) {
+                       ChunkStream cs = (ChunkStream)stream;
+                       for(int i=0; i<cs.getChunks().length; i++) {
+                               if(cs.getChunks()[i] != null &&
+                                               cs.getChunks()[i].getName() != null &&
+                                               cs.getChunks()[i].getName().equals("Text")) {
+                                       // The text lives in the first command. A Text chunk
+                                       //  for which no commands were found has nothing
+                                       //  to offer, so skip it rather than blow up
+                                       Command[] cmds = cs.getChunks()[i].getCommands();
+                                       if(cmds == null || cmds.length == 0) {
+                                               continue;
+                                       }
+                                       Command cmd = cmds[0];
+                                       if(cmd != null && cmd.getValue() != null) {
+                                               text.add( cmd.getValue().toString() );
+                                       }
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Returns the textual contents of the file.
+        * Each piece of text is put on its own line: a newline is added
+        *  after any entry which doesn't already end in \r or \n.
+        */
+       public String getText() {
+               StringBuffer text = new StringBuffer();
+               String[] allText = getAllText();
+               for(int i=0; i<allText.length; i++) {
+                       text.append(allText[i]);
+                       if(!allText[i].endsWith("\r") &&
+                                       !allText[i].endsWith("\n")) {
+                               text.append("\n");
+                       }
+               }
+               return text.toString();
+       }
+
+       /**
+        * Command line entry point: prints all the text of the Visio
+        *  file named by the first argument to stdout.
+        */
+       public static void main(String[] args) throws Exception {
+               if(args.length == 0) {
+                       System.err.println("Use:");
+                       System.err.println("   VisioTextExtractor <file.vsd>");
+                       System.exit(1);
+               }
+
+               // Make sure the file handle is released, even if the
+               //  file turns out not to be a valid Visio document
+               FileInputStream inp = new FileInputStream(args[0]);
+               try {
+                       VisioTextExtractor extractor = new VisioTextExtractor(inp);
+                       System.out.println(extractor.getText());
+               } finally {
+                       inp.close();
+               }
+       }
+}
index 75b6beefd1a04ea732c9b4c960ecb4af1bb86617..a59fe43ff924900667b3619b91ea107ac9ea2bd4 100644 (file)
@@ -43,6 +43,11 @@ public class ChunkStream extends Stream {
        public void findChunks() {
                ArrayList chunksA = new ArrayList();
                
+               if(getPointer().getOffset() == 0x64b3) {
+                       int i = 0;
+                       i++;
+               }
+               
                int pos = 0;
                byte[] contents = getStore().getContents();
                while(pos < contents.length) {
index 35aa7e5291a8ec8d860377347be1ff0a57936661..163fa83d9a19104259506a56ea349316c2154a9f 100644 (file)
@@ -83,7 +83,7 @@ public abstract class Stream {
                        return new ChunkStream(pointer, store, chunkFactory); 
                }
                else if(pointer.destinationHasStrings()) {
-                       return new StringsStream(pointer, store);
+                       return new StringsStream(pointer, store, chunkFactory);
                }
                
                // Give up and return a generic one
index 2688b156e989037e28546cd1e1ca2c2af8ba3664..b23ff921493b84a5ab9968fa4b4873bdc78b5d94 100644 (file)
@@ -16,13 +16,16 @@ limitations under the License.
 ==================================================================== */
 package org.apache.poi.hdgf.streams;
 
+import org.apache.poi.hdgf.chunks.ChunkFactory;
 import org.apache.poi.hdgf.pointers.Pointer;
 
 /**
- * A Stream which holds Strings
+ * A Stream which holds Strings. This is just another kind
+ *  of ChunkStream, it seems
  */
 public class StringsStream extends Stream {
-       protected StringsStream(Pointer pointer, StreamStore store) {
+       /**
+        * Builds the stream from its pointer and backing store.
+        * The chunkFactory is accepted but not used as yet: the class
+        *  still extends Stream, and the super call which would pass
+        *  it along is commented out below.
+        */
+       protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
                super(pointer, store);
+//             super(pointer, store, chunkFactory);
        }
 }
index c2d03f0c8992afc9e9d6a85ac2850a1de46c137c..5ea21d1a1c016e4e6501d5ed9a8b2cbbedac6a7c 100644 (file)
@@ -18,6 +18,7 @@ package org.apache.poi.hdgf.streams;
 
 import java.io.FileInputStream;
 
+import org.apache.poi.hdgf.chunks.Chunk;
 import org.apache.poi.hdgf.chunks.ChunkFactory;
 import org.apache.poi.hdgf.pointers.Pointer;
 import org.apache.poi.hdgf.pointers.PointerFactory;
@@ -202,4 +203,63 @@ public class TestStreamComplex extends StreamTest {
                assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
                assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
        }
+       
+       /**
+        * Walks from the trailer down to the ChunkStream at 0x64b3,
+        *  and checks that its two Text chunks hold the expected
+        *  strings.
+        */
+       public void testChunkWithText() throws Exception {
+               // The stream we're after hangs off the one at 0x7194,
+               //  which is among the later children of the trailer
+               Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
+               TrailerStream ts = (TrailerStream)
+                       Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
+               ts.findChildren(contents);
+
+               assertNotNull(ts.getChildPointers());
+               assertNotNull(ts.getPointedToStreams());
+               assertEquals(20, ts.getChildPointers().length);
+               assertEquals(20, ts.getPointedToStreams().length);
+
+               final int parentIdx = 13;
+               assertEquals(0x7194, ts.getChildPointers()[parentIdx].getOffset());
+               assertEquals(0x7194, ts.getPointedToStreams()[parentIdx].getPointer().getOffset());
+
+               PointerContainingStream ps7194 = (PointerContainingStream)
+                       ts.getPointedToStreams()[parentIdx];
+
+               // Its first child lives at 0x64b3
+               assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
+               assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
+
+               ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
+
+               // Un-compressed, it should come to 0x26bc bytes
+               assertEquals(0x26bc, cs.getStore().getContents().length);
+               // And it should have been split into lots of chunks
+               Chunk[] chunks = cs.getChunks();
+               assertEquals(131, chunks.length);
+
+               // At least one of the chunks is a Text one
+               boolean hasText = false;
+               for(int i=0; i<chunks.length; i++) {
+                       if(chunks[i].getName().equals("Text")) {
+                               hasText = true;
+                       }
+               }
+               assertTrue(hasText);
+
+               // The chunk at index 72 is Text, with a single
+               //  command holding our text
+               Chunk text = chunks[72];
+               assertEquals("Text", text.getName());
+               assertEquals(1, text.getCommands().length);
+               assertEquals("Test View\n", text.getCommands()[0].getValue());
+
+               // Almost at the end there is some more text
+               text = chunks[128];
+               assertEquals("Text", text.getName());
+               assertEquals(1, text.getCommands().length);
+               assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
+       }
 }