123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
- /* ====================================================================
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements. See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License. You may obtain a copy of the License at
-
- http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- ==================================================================== */
-
- package org.apache.poi.hwpf.usermodel;
-
- import static org.apache.poi.POITestCase.assertContains;
- import static org.junit.Assert.assertEquals;
-
- import java.io.IOException;
- import java.nio.charset.Charset;
-
- import org.apache.poi.OldFileFormatException;
- import org.apache.poi.hwmf.record.HwmfFont;
- import org.apache.poi.hwpf.HWPFOldDocument;
- import org.apache.poi.hwpf.HWPFTestCase;
- import org.apache.poi.hwpf.HWPFTestDataSamples;
- import org.apache.poi.hwpf.extractor.Word6Extractor;
- import org.apache.poi.hwpf.model.OldFontTable;
- import org.junit.Test;
-
- /**
- * Tests for Word 6 and Word 95 support
- */
- public final class TestHWPFOldDocument extends HWPFTestCase {
- /**
- * Test a simple Word 6 document
- */
- @Test(expected=OldFileFormatException.class)
- public void testWord6hwpf() throws IOException {
- // Can't open as HWPFDocument
- HWPFTestDataSamples.openSampleFile("Word6.doc");
- }
-
- @Test
- public void testWord6hwpfOld() throws IOException {
- // Open
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
-
- // Check
- assertEquals(1, doc.getRange().numSections());
- assertEquals(1, doc.getRange().numParagraphs());
- assertEquals(1, doc.getRange().numCharacterRuns());
-
- assertEquals("The quick brown fox jumps over the lazy dog\r",
- doc.getRange().getParagraph(0).text());
- doc.close();
- }
-
-
-
- /**
- * Test a simple Word 95 document
- */
- @Test(expected=OldFileFormatException.class)
- public void testWord95hwpf() throws IOException {
- // Can't open as HWPFDocument
- HWPFTestDataSamples.openSampleFile("Word95.doc");
- }
-
- @Test
- public void testWord95hwpfOld() throws IOException {
- // Open
- HWPFOldDocument doc = HWPFTestDataSamples
- .openOldSampleFile("Word95.doc");
-
- // Check
- assertEquals(1, doc.getRange().numSections());
- assertEquals(7, doc.getRange().numParagraphs());
-
- assertEquals("The quick brown fox jumps over the lazy dog\r",
- doc.getRange().getParagraph(0).text());
- assertEquals("\r", doc.getRange().getParagraph(1).text());
- assertEquals("Paragraph 2\r", doc.getRange().getParagraph(2).text());
- assertEquals("\r", doc.getRange().getParagraph(3).text());
- assertEquals(
- "Paragraph 3. Has some RED text and some "
- + "BLUE BOLD text in it.\r",
- doc.getRange().getParagraph(4).text());
- assertEquals("\r", doc.getRange().getParagraph(5).text());
- assertEquals("Last (4th) paragraph.\r",
- doc.getRange().getParagraph(6).text());
-
- assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
- assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
- assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
- assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
- // Normal, red, normal, blue+bold, normal
- assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
- assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
- // Normal, superscript for 4th, normal
- assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
-
- doc.close();
- }
-
- /**
- * Test a word document that has sections, as well as the usual paragraph
- * stuff.
- */
- @Test
- public void testWord6Sections() throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
-
- assertEquals(3, doc.getRange().numSections());
- assertEquals(6, doc.getRange().numParagraphs());
-
- assertEquals("This is a test.\r",
- doc.getRange().getParagraph(0).text());
- assertEquals("\r", doc.getRange().getParagraph(1).text());
- // Section / line?
- assertEquals("\u000c", doc.getRange().getParagraph(2).text());
- assertEquals("This is a new section.\r",
- doc.getRange().getParagraph(3).text());
- // Section / line?
- assertEquals("\u000c", doc.getRange().getParagraph(4).text());
- assertEquals("\r", doc.getRange().getParagraph(5).text());
- doc.close();
- }
-
- /**
- * Another word document with sections, this time with a few more section
- * properties set on it
- */
- @Test
- public void testWord6Sections2() throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples
- .openOldSampleFile("Word6_sections2.doc");
-
- assertEquals(1, doc.getRange().numSections());
- assertEquals(57, doc.getRange().numParagraphs());
-
- assertEquals("\r", doc.getRange().getParagraph(0).text());
- assertEquals("STATEMENT OF INSOLVENCY PRACTICE 10 (SCOTLAND)\r",
- doc.getRange().getParagraph(1).text());
- doc.close();
- }
-
- @Test
- public void testDefaultCodePageEncoding() throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
- Word6Extractor ex = new Word6Extractor(doc);
- String txt = ex.getText();
- assertContains(txt, "BERTHOD");
- assertContains(txt, "APPLICOLOR");
- assertContains(txt, "les meilleurs");
- assertContains(txt, "GUY LECOLE");
- }
-
-
- @Test
- public void testCodePageBug50955() throws IOException {
- //windows 1251
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
- Word6Extractor ex = new Word6Extractor(doc);
-
- StringBuilder sb = new StringBuilder();
- for (String p : ex.getParagraphText()) {
- sb.append(p);
- }
- assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
- }
-
- @Test
- public void testCodePageBug60936() throws IOException {
- //windows 1250 -- this test file was generated with OpenOffice
- //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
-
-
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
- Word6Extractor ex = new Word6Extractor(doc);
- StringBuilder sb = new StringBuilder();
- for (String p : ex.getParagraphText()) {
- sb.append(p);
- }
- assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings!
- }
-
- @Test
- public void testOldFontTableEncoding() throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
- OldFontTable oldFontTable = doc.getOldFontTable();
- assertEquals(5, oldFontTable.getFontNames().length);
- assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
- assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5"));
- assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
- doc.close();
-
- }
-
- @Test
- public void testOldFontTableAltName() throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
- OldFontTable oldFontTable = doc.getOldFontTable();
- assertEquals(5, oldFontTable.getFontNames().length);
- assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
- assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
- assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
- assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
- }
-
-
- @Test
- public void test51944() throws IOException {
- HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
- Word6Extractor ex = new Word6Extractor(doc);
- StringBuilder sb = new StringBuilder();
- for (String p : ex.getParagraphText()) {
- sb.append(p.replaceAll("[\r\n]+", "\n"));
- }
- String txt = sb.toString();
- assertContains(txt, "Post and Fax");
- assertContains(txt, "also maintain");//this is at a critical juncture
- assertContains(txt, "which are available for");//this too
-
- //TODO: figure out why these two aren't passing
- // assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
- // assertContains(txt, "We are able to");//not sure if we can get this easily?
- }
-
- }
|