123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
- package org.apache.poi.hwpf.usermodel;
-
- import static org.junit.Assert.assertEquals;
- import static org.junit.Assert.assertFalse;
- import static org.junit.Assert.assertNotNull;
- import static org.junit.Assert.assertTrue;
-
- import java.io.ByteArrayOutputStream;
- import java.io.File;
- import java.io.IOException;
- import java.io.InputStream;
- import java.util.Arrays;
- import java.util.Collection;
- import java.util.List;
-
- import org.apache.commons.codec.digest.DigestUtils;
- import org.apache.poi.POIDataSamples;
- import org.apache.poi.hwpf.HWPFDocument;
- import org.apache.poi.hwpf.HWPFOldDocument;
- import org.apache.poi.hwpf.HWPFTestDataSamples;
- import org.apache.poi.hwpf.converter.AbstractWordUtils;
- import org.apache.poi.hwpf.converter.WordToTextConverter;
- import org.apache.poi.hwpf.extractor.Word6Extractor;
- import org.apache.poi.hwpf.extractor.WordExtractor;
- import org.apache.poi.hwpf.model.FieldsDocumentPart;
- import org.apache.poi.hwpf.model.FileInformationBlock;
- import org.apache.poi.hwpf.model.PlexOfField;
- import org.apache.poi.hwpf.model.SubdocumentType;
- import org.apache.poi.hwpf.model.io.HWPFOutputStream;
- import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
- import org.apache.poi.poifs.filesystem.POIFSFileSystem;
- import org.apache.poi.util.IOUtils;
- import org.apache.poi.util.POILogFactory;
- import org.apache.poi.util.POILogger;
- import org.junit.Test;
-
- import junit.framework.TestCase;
-
- /**
- * Test different problems reported in the Apache Bugzilla
- * against HWPF
- */
- public class TestBugs{
-
- private static final POILogger logger = POILogFactory.getLogger(TestBugs.class);
-
- public static void assertEqualsIgnoreNewline(String expected, String actual )
- {
- String newExpected = expected.replaceAll("\r\n", "\n" )
- .replaceAll("\r", "\n").trim();
- String newActual = actual.replaceAll("\r\n", "\n" )
- .replaceAll("\r", "\n").trim();
- TestCase.assertEquals(newExpected, newActual);
- }
-
- private static void assertTableStructures(Range expected, Range actual )
- {
- assertEquals(expected.numParagraphs(), actual.numParagraphs());
- for (int p = 0; p < expected.numParagraphs(); p++ )
- {
- Paragraph expParagraph = expected.getParagraph(p);
- Paragraph actParagraph = actual.getParagraph(p);
-
- assertEqualsIgnoreNewline(expParagraph.text(), actParagraph.text());
- assertEquals("Diffent isInTable flags for paragraphs #" + p
- + " -- " + expParagraph + " -- " + actParagraph + ".",
- expParagraph.isInTable(), actParagraph.isInTable());
- assertEquals(expParagraph.isTableRowEnd(),
- actParagraph.isTableRowEnd());
-
- if (expParagraph.isInTable() && actParagraph.isInTable() )
- {
- Table expTable, actTable;
- try
- {
- expTable = expected.getTable(expParagraph);
- actTable = actual.getTable(actParagraph);
- }
- catch (Exception exc )
- {
- continue;
- }
-
- assertEquals(expTable.numRows(), actTable.numRows());
- assertEquals(expTable.numParagraphs(),
- actTable.numParagraphs());
- }
- }
- }
-
- private String getText(String samplefile) throws IOException {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile(samplefile);
- WordExtractor extractor = new WordExtractor(doc);
- try {
- return extractor.getText();
- } finally {
- extractor.close();
- doc.close();
- }
- }
-
- private String getTextOldFile(String samplefile) throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile(samplefile);
- Word6Extractor extractor = new Word6Extractor(doc);
- try {
- return extractor.getText();
- } finally {
- extractor.close();
- doc.close();
- }
- }
-
- /**
- * Bug 33519 - HWPF fails to read a file
- */
- @Test
- public void test33519() throws IOException
- {
- assertNotNull(getText("Bug33519.doc"));
- }
-
- /**
- * Bug 34898 - WordExtractor doesn't read the whole string from the file
- */
- @Test
- public void test34898() throws IOException
- {
- assertEqualsIgnoreNewline("\u30c7\u30a3\u30ec\u30af\u30c8\u30ea", getText("Bug34898.doc").trim());
- }
-
- /**
- * [RESOLVED INVALID] 41898 - Word 2003 pictures cannot be extracted
- */
- @Test
- public void test41898() throws IOException {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug41898.doc");
- List<Picture> pics = doc.getPicturesTable().getAllPictures();
-
- assertNotNull(pics);
- assertEquals(1, pics.size());
-
- Picture pic = pics.get(0);
- assertNotNull(pic.suggestFileExtension());
- assertNotNull(pic.suggestFullFileName());
-
- assertNotNull(pic.getContent());
- assertNotNull(pic.getRawContent());
-
- /*
- * This is a file with empty EMF image, but present Office Drawing
- * --sergey
- */
- final Collection<OfficeDrawing> officeDrawings = doc
- .getOfficeDrawingsMain().getOfficeDrawings();
- assertNotNull(officeDrawings);
- assertEquals(1, officeDrawings.size());
-
- OfficeDrawing officeDrawing = officeDrawings.iterator().next();
- assertNotNull(officeDrawing);
- assertEquals(1044, officeDrawing.getShapeId());
- doc.close();
- }
-
- /**
- * Bug 44331 - HWPFDocument.write destroys fields
- */
- @SuppressWarnings("deprecation")
- @Test
- public void test44431() throws IOException
- {
- HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug44431.doc");
-
- WordExtractor extractor1 = new WordExtractor(doc1);
- try {
- HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
-
- WordExtractor extractor2 = new WordExtractor(doc2);
- try {
- assertEqualsIgnoreNewline(extractor1.getFooterText(), extractor2.getFooterText());
- assertEqualsIgnoreNewline(extractor1.getHeaderText(), extractor2.getHeaderText());
- assertEqualsIgnoreNewline(Arrays.toString(extractor1.getParagraphText() ),
- Arrays.toString(extractor2.getParagraphText()));
-
- assertEqualsIgnoreNewline(extractor1.getText(), extractor2.getText());
- } finally {
- extractor2.close();
- }
- } finally {
- extractor1.close();
- doc1.close();
- }
- }
-
- /**
- * Bug 44331 - HWPFDocument.write destroys fields
- */
- @Test
- public void test44431_2() throws IOException
- {
- assertEqualsIgnoreNewline("File name=FieldsTest.doc\n" +
- "\n" +
- "\n" +
- "STYLEREF test\n" +
- "\n" +
- "\n" +
- "\n" +
- "TEST TABLE OF CONTENTS\n" +
- "\n" +
- "Heading paragraph in next page\t2\n" +
- "Another heading paragraph in further page\t3\n" +
- "Another heading paragraph in further page\t3\n" +
- "\n" +
- "\n" +
- "Heading paragraph in next page\n" +
- "Another heading paragraph in further page\n" +
- "\n" +
- "\n" +
- "\n" +
- "Page 3 of 3", getText("Bug44431.doc"));
- }
-
- /**
- * Bug 45473 - HWPF cannot read file after save
- */
- @Test
- public void test45473() throws IOException
- {
- // Fetch the current text
- HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug45473.doc");
- WordExtractor wordExtractor = new WordExtractor(doc1);
- final String text1;
- try {
- text1 = wordExtractor.getText().trim();
- } finally {
- wordExtractor.close();
- doc1.close();
- }
-
- // Re-load, then re-save and re-check
- doc1 = HWPFTestDataSamples.openSampleFile("Bug45473.doc");
- HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
- WordExtractor wordExtractor2 = new WordExtractor(doc2);
- final String text2;
- try {
- text2 = wordExtractor2.getText().trim();
- } finally {
- wordExtractor2.close();
- doc1.close();
- }
-
- // the text in the saved document has some differences in line
- // separators but we tolerate that
- assertEqualsIgnoreNewline(text1.replaceAll("\n", "" ), text2.replaceAll("\n", ""));
- }
-
- /**
- * Bug 46220 - images are not properly extracted
- */
- @Test
- public void test46220() throws IOException {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug46220.doc");
- // reference checksums as in Bugzilla
- String[] md5 = { "851be142bce6d01848e730cb6903f39e",
- "7fc6d8fb58b09ababd036d10a0e8c039",
- "a7dc644c40bc2fbf17b2b62d07f99248",
- "72d07b8db5fad7099d90bc4c304b4666" };
- List<Picture> pics = doc.getPicturesTable().getAllPictures();
- assertEquals(4, pics.size());
- for (int i = 0; i < pics.size(); i++ )
- {
- Picture pic = pics.get(i);
- byte[] data = pic.getRawContent();
- // use Apache Commons Codec utils to compute md5
- assertEqualsIgnoreNewline(md5[i], DigestUtils.md5Hex(data));
- }
- doc.close();
- }
-
- /**
- * [RESOLVED FIXED] Bug 46817 - Regression: Text from some table cells
- * missing
- */
- @Test
- public void test46817() throws IOException
- {
- String text = getText("Bug46817.doc").trim();
-
- assertTrue(text.contains("Nazwa wykonawcy"));
- assertTrue(text.contains("kujawsko-pomorskie"));
- assertTrue(text.contains("ekomel@ekomel.com.pl"));
- }
-
- /**
- * [FAILING] Bug 47286 - Word documents saves in wrong format if source
- * contains form elements
- */
- @SuppressWarnings("deprecation")
- @Test
- public void test47286() throws IOException
- {
- // Fetch the current text
- HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug47286.doc");
- WordExtractor wordExtractor = new WordExtractor(doc1);
- final String text1;
- try {
- text1 = wordExtractor.getText().trim();
- } finally {
- wordExtractor.close();
- doc1.close();
- }
-
- // Re-load, then re-save and re-check
- doc1 = HWPFTestDataSamples.openSampleFile("Bug47286.doc");
- HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
- WordExtractor wordExtractor2 = new WordExtractor(doc2);
- final String text2;
- try {
- text2 = wordExtractor2.getText().trim();
- } finally {
- wordExtractor2.close();
- doc1.close();
- }
-
- // the text in the saved document has some differences in line
- // separators but we tolerate that
- assertEqualsIgnoreNewline(text1.replaceAll("\n", "" ), text2.replaceAll("\n", ""));
-
- assertEquals(doc1.getCharacterTable().getTextRuns().size(), doc2
- .getCharacterTable().getTextRuns().size());
-
- List<PlexOfField> expectedFields = doc1.getFieldsTables()
- .getFieldsPLCF(FieldsDocumentPart.MAIN);
- List<PlexOfField> actualFields = doc2.getFieldsTables().getFieldsPLCF(
- FieldsDocumentPart.MAIN);
- assertEquals(expectedFields.size(), actualFields.size());
-
- assertTableStructures(doc1.getRange(), doc2.getRange());
- }
-
- /**
- * [RESOLVED FIXED] Bug 47287 - StringIndexOutOfBoundsException in
- * CharacterRun.replaceText()
- */
- @Test
- public void test47287()
- {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug47287.doc");
- String[] values = { "1-1", "1-2", "1-3", "1-4", "1-5", "1-6", "1-7",
- "1-8", "1-9", "1-10", "1-11", "1-12", "1-13", "1-14", "1-15", };
- int usedVal = 0;
- String PLACEHOLDER = "\u2002\u2002\u2002\u2002\u2002";
- Range r = doc.getRange();
- for (int x = 0; x < r.numSections(); x++ )
- {
- Section s = r.getSection(x);
- for (int y = 0; y < s.numParagraphs(); y++ )
- {
- Paragraph p = s.getParagraph(y);
-
- for (int z = 0; z < p.numCharacterRuns(); z++ )
- {
- boolean isFound = false;
-
- // character run
- CharacterRun run = p.getCharacterRun(z);
- // character run text
- String text = run.text();
- String oldText = text;
- int c = text.indexOf("FORMTEXT ");
- if (c < 0 )
- {
- int k = text.indexOf(PLACEHOLDER);
- if (k >= 0 )
- {
- text = text.substring(0, k ) + values[usedVal]
- + text.substring(k + PLACEHOLDER.length());
- usedVal++;
- isFound = true;
- }
- }
- else
- {
- for (; c >= 0; c = text.indexOf("FORMTEXT ", c
- + "FORMTEXT ".length() ) )
- {
- int k = text.indexOf(PLACEHOLDER, c);
- if (k >= 0 )
- {
- text = text.substring(0, k )
- + values[usedVal]
- + text.substring(k
- + PLACEHOLDER.length());
- usedVal++;
- isFound = true;
- }
- }
- }
- if (isFound )
- {
- run.replaceText(oldText, text, 0);
- }
-
- }
- }
- }
-
- String docText = r.text();
-
- assertTrue(docText.contains("1-1"));
- assertTrue(docText.contains("1-12"));
-
- assertFalse(docText.contains("1-13"));
- assertFalse(docText.contains("1-15"));
- }
-
- /**
- * [RESOLVED FIXED] Bug 47731 - Word Extractor considers text copied from
- * some website as an embedded object
- */
- @Test
- public void test47731() throws Exception
- {
- String foundText = getText("Bug47731.doc");
-
- assertTrue(foundText
- .contains("Soak the rice in water for three to four hours"));
- }
-
- /**
- * Bug 4774 - text extracted by WordExtractor is broken
- */
- @Test
- public void test47742() throws Exception
- {
- // (1) extract text from MS Word document via POI
- String foundText = getText("Bug47742.doc");
-
- // (2) read text from text document (retrieved by saving the word
- // document as text file using encoding UTF-8)
- InputStream is = POIDataSamples.getDocumentInstance()
- .openResourceAsStream("Bug47742-text.txt");
- try {
- byte[] expectedBytes = IOUtils.toByteArray(is);
- String expectedText = new String(expectedBytes, "utf-8" )
- .substring(1); // strip-off the unicode marker
-
- assertEqualsIgnoreNewline(expectedText, foundText);
- } finally {
- is.close();
- }
- }
-
- /**
- * Bug 47958 - Exception during Escher walk of pictures
- */
- @Test
- public void test47958()
- {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug47958.doc");
- doc.getPicturesTable().getAllPictures();
- }
-
- /**
- * [RESOLVED FIXED] Bug 48065 - Problems with save output of HWPF (losing
- * formatting)
- */
- @Test
- public void test48065()
- {
- HWPFDocument doc1 = HWPFTestDataSamples.openSampleFile("Bug48065.doc");
- HWPFDocument doc2 = HWPFTestDataSamples.writeOutAndReadBack(doc1);
-
- Range expected = doc1.getRange();
- Range actual = doc2.getRange();
-
- assertEqualsIgnoreNewline(
- expected.text().replace("\r", "\n").replaceAll("\n\n", "\n" ),
- actual.text().replace("\r", "\n").replaceAll("\n\n", "\n"));
-
- assertTableStructures(expected, actual);
- }
-
- @Test
- public void test49933() throws IOException
- {
- String text = getTextOldFile("Bug49933.doc");
-
- assertTrue( text.contains( "best.wine.jump.ru" ) );
- }
-
- /**
- * Bug 50936 - Exception parsing MS Word 8.0 file
- */
- @Test
- public void test50936_1()
- {
- HWPFDocument hwpfDocument = HWPFTestDataSamples
- .openSampleFile("Bug50936_1.doc");
- hwpfDocument.getPicturesTable().getAllPictures();
- }
-
- /**
- * Bug 50936 - Exception parsing MS Word 8.0 file
- */
- @Test
- public void test50936_2()
- {
- HWPFDocument hwpfDocument = HWPFTestDataSamples
- .openSampleFile("Bug50936_2.doc");
- hwpfDocument.getPicturesTable().getAllPictures();
- }
-
- /**
- * Bug 50936 - Exception parsing MS Word 8.0 file
- */
- @Test
- public void test50936_3()
- {
- HWPFDocument hwpfDocument = HWPFTestDataSamples
- .openSampleFile("Bug50936_3.doc");
- hwpfDocument.getPicturesTable().getAllPictures();
- }
-
-
- /**
- * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
- * release from download site )
- */
- @Test
- public void test51604()
- {
- HWPFDocument document = HWPFTestDataSamples
- .openSampleFile("Bug51604.doc");
-
- Range range = document.getRange();
- int numParagraph = range.numParagraphs();
- int counter = 0;
- for (int i = 0; i < numParagraph; i++ )
- {
- Paragraph paragraph = range.getParagraph(i);
- int numCharRuns = paragraph.numCharacterRuns();
- for (int j = 0; j < numCharRuns; j++ )
- {
- CharacterRun charRun = paragraph.getCharacterRun(j);
- String text = charRun.text();
- charRun.replaceText(text, "+" + (++counter));
- }
- }
-
- document = HWPFTestDataSamples.writeOutAndReadBack(document);
- String text = document.getDocumentText();
- assertEqualsIgnoreNewline("+1+2+3+4+5+6+7+8+9+10+11+12", text);
- }
-
- /**
- * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
- * release from download site )
- */
- @Test
- public void test51604p2() throws Exception
- {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug51604.doc");
-
- Range range = doc.getRange();
- int numParagraph = range.numParagraphs();
- replaceText(range, numParagraph);
-
- doc = HWPFTestDataSamples.writeOutAndReadBack(doc);
- final FileInformationBlock fileInformationBlock = doc
- .getFileInformationBlock();
-
- int totalLength = 0;
- for (SubdocumentType type : SubdocumentType.values() )
- {
- final int partLength = fileInformationBlock
- .getSubdocumentTextStreamLength(type);
- assert (partLength >= 0);
-
- totalLength += partLength;
- }
- assertEquals(doc.getText().length(), totalLength);
- }
-
- private void replaceText(Range range, int numParagraph) {
- for (int i = 0; i < numParagraph; i++ )
- {
- Paragraph paragraph = range.getParagraph(i);
- int numCharRuns = paragraph.numCharacterRuns();
- for (int j = 0; j < numCharRuns; j++ )
- {
- CharacterRun charRun = paragraph.getCharacterRun(j);
- String text = charRun.text();
- if (text.contains("Header" ) ) {
- charRun.replaceText(text, "added");
- }
- }
- }
- }
-
- /**
- * [RESOLVED FIXED] Bug 51604 - replace text fails for doc (poi 3.8 beta
- * release from download site )
- */
- @Test
- public void test51604p3() throws Exception
- {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug51604.doc");
-
- byte[] originalData = new byte[doc.getFileInformationBlock()
- .getLcbDop()];
- System.arraycopy(doc.getTableStream(), doc.getFileInformationBlock()
- .getFcDop(), originalData, 0, originalData.length);
-
- HWPFOutputStream outputStream = new HWPFOutputStream();
- doc.getDocProperties().writeTo(outputStream);
- final byte[] oldData = outputStream.toByteArray();
-
- assertEqualsIgnoreNewline(Arrays.toString(originalData ),
- Arrays.toString(oldData));
-
- Range range = doc.getRange();
- int numParagraph = range.numParagraphs();
- replaceText(range, numParagraph);
-
- doc = HWPFTestDataSamples.writeOutAndReadBack(doc);
-
- outputStream = new HWPFOutputStream();
- doc.getDocProperties().writeTo(outputStream);
- final byte[] newData = outputStream.toByteArray();
-
- assertEqualsIgnoreNewline(Arrays.toString(oldData ), Arrays.toString(newData));
- }
-
- /**
- * [RESOLVED FIXED] Bug 51671 - HWPFDocument.write based on NPOIFSFileSystem
- * throws a NullPointerException
- */
- @Test
- public void test51671() throws Exception
- {
- InputStream is = POIDataSamples.getDocumentInstance()
- .openResourceAsStream("empty.doc");
- NPOIFSFileSystem npoifsFileSystem = new NPOIFSFileSystem(is);
- try {
- HWPFDocument hwpfDocument = new HWPFDocument(
- npoifsFileSystem.getRoot());
- hwpfDocument.write(new ByteArrayOutputStream());
- hwpfDocument.close();
- } finally {
- npoifsFileSystem.close();
- }
- }
-
- /**
- * Bug 51678 - Extracting text from Bug51524.zip is slow Bug 51524 -
- * PapBinTable constructor is slow
- */
- @Test
- public void test51678And51524() throws IOException
- {
- // YK: the test will run only if the poi.test.remote system property is
- // set.
- // TODO: refactor into something nicer!
- if (System.getProperty("poi.test.remote" ) != null )
- {
- String href = "http://domex.nps.edu/corp/files/govdocs1/007/007488.doc";
- HWPFDocument hwpfDocument = HWPFTestDataSamples
- .openRemoteFile(href);
-
- WordExtractor wordExtractor = new WordExtractor(hwpfDocument);
- try {
- wordExtractor.getText();
- } finally {
- wordExtractor.close();
- }
- }
- }
-
- /**
- * [FIXED] Bug 51902 - Picture.fillRawImageContent -
- * ArrayIndexOutOfBoundsException
- */
- @Test
- public void testBug51890()
- {
- HWPFDocument doc = HWPFTestDataSamples.openSampleFile("Bug51890.doc");
- for (Picture picture : doc.getPicturesTable().getAllPictures() )
- {
- PictureType pictureType = picture.suggestPictureType();
- logger.log(POILogger.DEBUG,
- "Picture at offset " + picture.getStartOffset()
- + " has type " + pictureType);
- }
- }
-
- /**
- * [RESOLVED FIXED] Bug 51834 - Opening and Writing .doc file results in
- * corrupt document
- */
- @Test
- public void testBug51834() throws Exception
- {
- /*
- * we don't have Java test for this file - it should be checked using
- * Microsoft BFF Validator. But check read-write-read anyway. -- sergey
- */
- HWPFTestDataSamples.openSampleFile("Bug51834.doc");
- HWPFTestDataSamples.writeOutAndReadBack(HWPFTestDataSamples
- .openSampleFile("Bug51834.doc"));
- }
-
- /**
- * Bug 51944 - PAPFormattedDiskPage.getPAPX - IndexOutOfBounds
- */
- @Test
- public void testBug51944() throws Exception
- {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
- assertNotNull(WordToTextConverter.getText(doc));
- }
-
- /**
- * Bug 52032 - [BUG] & [partial-PATCH] HWPF - ArrayIndexOutofBoundsException
- * with no stack trace (broken after revision 1178063)
- */
- @Test
- public void testBug52032_1() throws Exception
- {
- assertNotNull(getText("Bug52032_1.doc"));
- }
-
- /**
- * Bug 52032 - [BUG] & [partial-PATCH] HWPF - ArrayIndexOutofBoundsException
- * with no stack trace (broken after revision 1178063)
- */
- @Test
- public void testBug52032_2() throws Exception
- {
- assertNotNull(getText("Bug52032_2.doc"));
- }
-
- /**
- * Bug 52032 - [BUG] & [partial-PATCH] HWPF - ArrayIndexOutofBoundsException
- * with no stack trace (broken after revision 1178063)
- */
- @Test
- public void testBug52032_3() throws Exception
- {
- assertNotNull(getText("Bug52032_3.doc"));
- }
-
- /**
- * Bug 53380 - ArrayIndexOutOfBounds Exception parsing word 97 document
- */
- @Test
- public void testBug53380_1() throws Exception
- {
- assertNotNull(getText("Bug53380_1.doc"));
- }
-
- /**
- * Bug 53380 - ArrayIndexOutOfBounds Exception parsing word 97 document
- */
- @Test
- public void testBug53380_2() throws Exception
- {
- assertNotNull(getText("Bug53380_2.doc"));
- }
-
- /**
- * Bug 53380 - ArrayIndexOutOfBounds Exception parsing word 97 document
- */
- @Test
- public void testBug53380_3() throws Exception
- {
- assertNotNull(getText("Bug53380_3.doc"));
- }
-
- /**
- * Bug 53380 - ArrayIndexOutOfBounds Exception parsing word 97 document
- */
- @Test
- public void testBug53380_4() throws Exception
- {
- assertNotNull(getText("Bug53380_4.doc"));
- }
-
- /**
- * java.lang.UnsupportedOperationException: Non-extended character
- * Pascal strings are not supported right now
- *
- * Disabled pending a fix for the bug
- */
- @Test
- public void test56880() throws Exception {
- HWPFDocument doc =
- HWPFTestDataSamples.openSampleFile("56880.doc");
- assertEqualsIgnoreNewline("Check Request", doc.getRange().text());
- }
-
-
- // These are the values the are expected to be read when the file
- // is checked.
- private final int section1LeftMargin = 1440;
- private final int section1RightMargin = 1440;
- private final int section1TopMargin = 1440;
- private final int section1BottomMargin = 1440;
- private final int section1NumColumns = 1;
- private int section2LeftMargin = 1440;
- private int section2RightMargin = 1440;
- private int section2TopMargin = 1440;
- private int section2BottomMargin = 1440;
- private final int section2NumColumns = 3;
-
- @Test
- @SuppressWarnings("SuspiciousNameCombination")
- public void testHWPFSections() {
- HWPFDocument document = HWPFTestDataSamples.openSampleFile("Bug53453Section.doc");
- Range overallRange = document.getOverallRange();
- int numParas = overallRange.numParagraphs();
- for(int i = 0; i < numParas; i++) {
- Paragraph para = overallRange.getParagraph(i);
- int numSections = para.numSections();
- for(int j = 0; j < numSections; j++) {
- Section section = para.getSection(j);
- if(para.text().trim().equals("Section1")) {
- assertSection1Margin(section);
- }
- else if(para.text().trim().equals("Section2")) {
- assertSection2Margin(section);
-
- // Change the margin widths
- this.section2BottomMargin = (int)(1.5 * AbstractWordUtils.TWIPS_PER_INCH);
- this.section2TopMargin = (int)(1.75 * AbstractWordUtils.TWIPS_PER_INCH);
- this.section2LeftMargin = (int)(0.5 * AbstractWordUtils.TWIPS_PER_INCH);
- this.section2RightMargin = (int)(0.75 * AbstractWordUtils.TWIPS_PER_INCH);
- section.setMarginBottom(this.section2BottomMargin);
- section.setMarginLeft(this.section2LeftMargin);
- section.setMarginRight(this.section2RightMargin);
- section.setMarginTop(this.section2TopMargin);
- }
- }
- }
-
- // Save away and re-read the document to prove the chages are permanent
- document = HWPFTestDataSamples.writeOutAndReadBack(document);
- overallRange = document.getOverallRange();
- numParas = overallRange.numParagraphs();
- for(int i = 0; i < numParas; i++) {
- Paragraph para = overallRange.getParagraph(i);
- int numSections = para.numSections();
- for(int j = 0; j < numSections; j++) {
- Section section = para.getSection(j);
- if(para.text().trim().equals("Section1")) {
- // No changes to the margins in Section1
- assertSection1Margin(section);
- }
- else if(para.text().trim().equals("Section2")) {
- // The margins in Section2 have kept the new settings.
- assertSection2Margin(section);
- }
- }
- }
- }
-
- @SuppressWarnings("Duplicates")
- private void assertSection1Margin(Section section) {
- assertEquals(section1BottomMargin, section.getMarginBottom());
- assertEquals(section1LeftMargin, section.getMarginLeft());
- assertEquals(section1RightMargin, section.getMarginRight());
- assertEquals(section1TopMargin, section.getMarginTop());
- assertEquals(section1NumColumns, section.getNumColumns());
- }
-
- @SuppressWarnings("Duplicates")
- private void assertSection2Margin(Section section) {
- assertEquals(section2BottomMargin, section.getMarginBottom());
- assertEquals(section2LeftMargin, section.getMarginLeft());
- assertEquals(section2RightMargin, section.getMarginRight());
- assertEquals(section2TopMargin, section.getMarginTop());
- assertEquals(section2NumColumns, section.getNumColumns());
- }
-
- @Test
- public void testRegressionIn315beta2() {
- HWPFDocument hwpfDocument = HWPFTestDataSamples.openSampleFile("cap.stanford.edu_profiles_viewbiosketch_facultyid=4009&name=m_maciver.doc");
- assertNotNull(hwpfDocument);
- }
-
- @Test(expected=ArrayIndexOutOfBoundsException.class)
- public void test57603SevenRowTable() throws Exception {
- HWPFDocument hwpfDocument = HWPFTestDataSamples.openSampleFile("57603-seven_columns.doc");
- HWPFDocument hwpfDocument2 = HWPFTestDataSamples.writeOutAndReadBack(hwpfDocument);
- assertNotNull(hwpfDocument2);
- hwpfDocument2.close();
- hwpfDocument.close();
- }
-
- @Test(expected=ArrayIndexOutOfBoundsException.class)
- public void test57843() throws IOException {
- File f = POIDataSamples.getDocumentInstance().getFile("57843.doc");
- POIFSFileSystem fs = new POIFSFileSystem(f, true);
- try {
- HWPFOldDocument doc = new HWPFOldDocument(fs);
- assertNotNull(doc);
- doc.close();
- } finally {
- fs.close();
- }
- }
-
- @Test
- public void testCommonCrawlRegression() throws IOException {
- HWPFDocument document = HWPFTestDataSamples.openSampleFile("ca.kwsymphony.www_education_School_Concert_Seat_Booking_Form_2011-12.doc");
- document.close();
- }
- }
|