import org.apache.poi.hdgf.chunks.ChunkFactory.CommandDefinition;
import org.apache.poi.util.LittleEndian;
+import org.apache.poi.util.StringUtil;
/**
* Base of all chunks, which hold data, flags etc
LittleEndian.getDouble(contents, offset)
);
break;
+ case 12:
+     // A Little Endian (UTF-16LE) String
+     // Starts 8 bytes into the data segment
+     // Ends at end of data, or at a 00 00 code unit
+     int startsAt = 8;
+     if(contents.length <= startsAt) {
+         // Too short to hold any text, so there is nothing to decode
+         command.value = "";
+         break;
+     }
+
+     // Only look at whole 2 byte code units. Testing every byte offset
+     //  would pair the 00 high byte of a character with the first byte
+     //  of the terminator (or of the following character), which cuts
+     //  the string short by a character.
+     // -1 means "no terminator found", so that a terminator sitting
+     //  right at startsAt (an empty string) is still recognised.
+     int endsAt = -1;
+     for(int j=startsAt; j<contents.length-1 && endsAt == -1; j+=2) {
+         if(contents[j] == 0 && contents[j+1] == 0) {
+             endsAt = j;
+         }
+     }
+     if(endsAt == -1) {
+         // No terminator, the text runs to the end of the data
+         endsAt = contents.length;
+     }
+
+     // Length is in characters, not bytes; a trailing odd byte is ignored
+     int strLen = (endsAt-startsAt) / 2;
+     command.value = StringUtil.getFromUnicodeLE(contents, startsAt, strLen);
+     break;
case 25:
command.value = new Short(
- LittleEndian.getShort(contents, offset)
+ LittleEndian.getShort(contents, offset)
);
break;
case 26:
* Does the chunk have a separator?
*/
public boolean hasSeparator() {
+ // For some reason, there are two types that don't have a
+ // separator despite the flags that indicate they do
+ if(type == 0x1f || type == 0xc9) { return false; }
+
// If there's a trailer, there's a separator
if(hasTrailer()) { return true; }
separatorData = new byte[4];
System.arraycopy(data, offset, separatorData, 0, 4);
}
+
+ public String toString() {
+ return "<ChunkSeparator of length " + separatorData.length + ">";
+ }
}
trailerData = new byte[8];
System.arraycopy(data, offset, trailerData, 0, 8);
}
+
+ public String toString() {
+ return "<ChunkTrailer of length " + trailerData.length + ">";
+ }
}
" - " + Integer.toHexString(ptr.getFormat()));
System.out.println(ind + " Length is\t" + ptr.getLength() +
" - " + Integer.toHexString(ptr.getLength()));
+ if(ptr.destinationCompressed()) {
+ int decompLen = stream._getContentsLength();
+ System.out.println(ind + " DC.Length is\t" + decompLen +
+ " - " + Integer.toHexString(decompLen));
+ }
System.out.println(ind + " Compressed is\t" + ptr.destinationCompressed());
System.out.println(ind + " Stream is\t" + stream.getClass().getName());
for(int i=0; i<cs.getChunks().length; i++) {
Chunk chunk = cs.getChunks()[i];
System.out.println(ind2 + "" + chunk.getName());
+ System.out.println(ind2 + " Length is " + chunk._getContents().length + " (" + Integer.toHexString(chunk._getContents().length) + ")");
+ System.out.println(ind2 + " OD Size is " + chunk.getOnDiskSize() + " (" + Integer.toHexString(chunk.getOnDiskSize()) + ")");
+ System.out.println(ind2 + " T / S is " + chunk.getTrailer() + " / " + chunk.getSeparator());
System.out.println(ind2 + " Holds " + chunk.getCommands().length + " commands");
for(int j=0; j<chunk.getCommands().length; j++) {
Command command = chunk.getCommands()[j];
--- /dev/null
+/* ====================================================================
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+==================================================================== */
+package org.apache.poi.hdgf.extractor;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+
+import org.apache.poi.hdgf.HDGFDiagram;
+import org.apache.poi.hdgf.chunks.Chunk.Command;
+import org.apache.poi.hdgf.streams.ChunkStream;
+import org.apache.poi.hdgf.streams.PointerContainingStream;
+import org.apache.poi.hdgf.streams.Stream;
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+
+/**
+ * Class to find all the text in a Visio file, and return it.
+ * Can operate on the command line (outputs to stdout), or
+ *  can return the text for you (eg for use with Lucene).
+ */
+public class VisioTextExtractor {
+    private HDGFDiagram hdgf;
+    private POIFSFileSystem fs;
+
+    /**
+     * Creates an extractor working on an already opened diagram.
+     */
+    public VisioTextExtractor(HDGFDiagram hdgf) {
+        this.hdgf = hdgf;
+    }
+    /**
+     * Creates an extractor for the diagram held in the given filesystem.
+     *
+     * @throws IOException if the diagram cannot be opened
+     */
+    public VisioTextExtractor(POIFSFileSystem fs) throws IOException {
+        this(new HDGFDiagram(fs));
+        this.fs = fs;
+    }
+    /**
+     * Creates an extractor for the diagram read from the given stream.
+     * The stream is not closed by this class itself.
+     *
+     * @throws IOException if the stream cannot be read or opened
+     */
+    public VisioTextExtractor(InputStream inp) throws IOException {
+        this(new POIFSFileSystem(inp));
+    }
+
+    /**
+     * Locates all the text entries in the file, and returns their
+     *  contents.
+     *
+     * @return one entry per piece of text found, in the order the
+     *  streams were walked. Empty (never null) if no text was found.
+     */
+    public String[] getAllText() {
+        ArrayList text = new ArrayList();
+        Stream[] topLevel = hdgf.getTopLevelStreams();
+        for(int i=0; i<topLevel.length; i++) {
+            findText(topLevel[i], text);
+        }
+        // No diagnostics are printed here: callers using this as a
+        //  library should not get output on stderr
+        return (String[])text.toArray( new String[text.size()] );
+    }
+
+    /**
+     * Walks the given stream, first descending into any streams it
+     *  points to, then looking through its chunks. For every chunk
+     *  named "Text", the value of its first command is added to the list.
+     *
+     * @param stream the stream to search
+     * @param text the list that found strings are appended to
+     */
+    private void findText(Stream stream, ArrayList text) {
+        if(stream instanceof PointerContainingStream) {
+            PointerContainingStream ps = (PointerContainingStream)stream;
+            Stream[] children = ps.getPointedToStreams();
+            for(int i=0; i<children.length; i++) {
+                findText(children[i], text);
+            }
+        }
+        if(stream instanceof ChunkStream) {
+            ChunkStream cs = (ChunkStream)stream;
+            for(int i=0; i<cs.getChunks().length; i++) {
+                if(cs.getChunks()[i] == null ||
+                        !"Text".equals(cs.getChunks()[i].getName())) {
+                    continue;
+                }
+
+                // The text is held by the first command. A chunk that
+                //  has no commands at all is skipped rather than
+                //  failing with an ArrayIndexOutOfBoundsException
+                Command[] commands = cs.getChunks()[i].getCommands();
+                if(commands == null || commands.length == 0) {
+                    continue;
+                }
+                Command cmd = commands[0];
+                if(cmd != null && cmd.getValue() != null) {
+                    text.add( cmd.getValue().toString() );
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the textual contents of the file.
+     * Each piece of text ends up on its own line: a newline is added
+     *  after any entry that doesn't already end with \r or \n.
+     */
+    public String getText() {
+        StringBuffer text = new StringBuffer();
+        String[] allText = getAllText();
+        for(int i=0; i<allText.length; i++) {
+            text.append(allText[i]);
+            if(!allText[i].endsWith("\r") &&
+                    !allText[i].endsWith("\n")) {
+                text.append("\n");
+            }
+        }
+        return text.toString();
+    }
+
+    /**
+     * Command line entry point: prints the text of the given file
+     *  to stdout, or a usage message (exit code 1) if no file is given.
+     */
+    public static void main(String[] args) throws Exception {
+        if(args.length == 0) {
+            System.err.println("Use:");
+            System.err.println("   VisioTextExtractor <file.vsd>");
+            System.exit(1);
+        }
+
+        // Make sure the file handle is released, even if parsing fails
+        FileInputStream inp = new FileInputStream(args[0]);
+        try {
+            VisioTextExtractor extractor = new VisioTextExtractor(inp);
+            System.out.println(extractor.getText());
+        } finally {
+            inp.close();
+        }
+    }
+}
public void findChunks() {
ArrayList chunksA = new ArrayList();
+ if(getPointer().getOffset() == 0x64b3) {
+ int i = 0;
+ i++;
+ }
+
int pos = 0;
byte[] contents = getStore().getContents();
while(pos < contents.length) {
return new ChunkStream(pointer, store, chunkFactory);
}
else if(pointer.destinationHasStrings()) {
- return new StringsStream(pointer, store);
+ return new StringsStream(pointer, store, chunkFactory);
}
// Give up and return a generic one
==================================================================== */
package org.apache.poi.hdgf.streams;
+import org.apache.poi.hdgf.chunks.ChunkFactory;
import org.apache.poi.hdgf.pointers.Pointer;
/**
- * A Stream which holds Strings
+ * A Stream which holds Strings. This is just another kind
+ * of ChunkStream, it seems
*/
public class StringsStream extends Stream {
- protected StringsStream(Pointer pointer, StreamStore store) {
+ protected StringsStream(Pointer pointer, StreamStore store, ChunkFactory chunkFactory) {
super(pointer, store);
+// TODO: chunkFactory is accepted but not used yet. Presumably, once this class extends ChunkStream, this becomes super(pointer, store, chunkFactory) - confirm
}
}
import java.io.FileInputStream;
+import org.apache.poi.hdgf.chunks.Chunk;
import org.apache.poi.hdgf.chunks.ChunkFactory;
import org.apache.poi.hdgf.pointers.Pointer;
import org.apache.poi.hdgf.pointers.PointerFactory;
assertTrue(s8451.getPointedToStreams()[0] instanceof StringsStream);
assertTrue(s8451.getPointedToStreams()[1] instanceof StringsStream);
}
+
+ public void testChunkWithText() throws Exception {
+ // Parent stream (a PointerContainingStream) is at 0x7194
+ // It is child 13, out of 20, of the trailer
+ Pointer trailerPtr = ptrFactory.createPointer(contents, trailerPointerAt);
+ TrailerStream ts = (TrailerStream)
+ Stream.createStream(trailerPtr, contents, chunkFactory, ptrFactory);
+
+ ts.findChildren(contents);
+
+ assertNotNull(ts.getChildPointers());
+ assertNotNull(ts.getPointedToStreams());
+ assertEquals(20, ts.getChildPointers().length);
+ assertEquals(20, ts.getPointedToStreams().length);
+
+ assertEquals(0x7194, ts.getChildPointers()[13].getOffset());
+ assertEquals(0x7194, ts.getPointedToStreams()[13].getPointer().getOffset());
+
+ PointerContainingStream ps7194 = (PointerContainingStream)
+ ts.getPointedToStreams()[13];
+
+ // Its first child is the ChunkStream at 0x64b3
+ assertEquals(0x64b3, ps7194.getChildPointers()[0].getOffset());
+ assertEquals(0x64b3, ps7194.getPointedToStreams()[0].getPointer().getOffset());
+
+ ChunkStream cs = (ChunkStream)ps7194.getPointedToStreams()[0];
+
+ // The store contents should be 0x26bc bytes long
+ assertEquals(0x26bc, cs.getStore().getContents().length);
+ // And should hold 131 chunks
+ assertEquals(131, cs.getChunks().length);
+
+ // At least one of which is named Text
+ boolean hasText = false;
+ for(int i=0; i<cs.getChunks().length; i++) {
+ if(cs.getChunks()[i].getName().equals("Text")) {
+ hasText = true;
+ }
+ }
+ assertTrue(hasText);
+ // Specifically, the chunk at index 72 is a Text one
+ assertEquals("Text", cs.getChunks()[72].getName());
+
+ Chunk text = cs.getChunks()[72];
+ assertEquals("Text", text.getName());
+
+ // Its single command holds our text as its value
+ assertEquals(1, text.getCommands().length);
+ assertEquals("Test View\n", text.getCommands()[0].getValue());
+
+
+ // Almost at the end (index 128) is some more text
+ assertEquals("Text", cs.getChunks()[128].getName());
+ text = cs.getChunks()[128];
+ assertEquals("Text", text.getName());
+
+ assertEquals(1, text.getCommands().length);
+ assertEquals("Some random text, on a page\n", text.getCommands()[0].getValue());
+ }
}