You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

TestHWPFOldDocument.java 9.2KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hwpf.usermodel;
  16. import static org.apache.poi.POITestCase.assertContains;
  17. import static org.junit.Assert.assertEquals;
  18. import java.io.IOException;
  19. import java.nio.charset.Charset;
  20. import org.apache.poi.OldFileFormatException;
  21. import org.apache.poi.hwmf.record.HwmfFont;
  22. import org.apache.poi.hwpf.HWPFOldDocument;
  23. import org.apache.poi.hwpf.HWPFTestCase;
  24. import org.apache.poi.hwpf.HWPFTestDataSamples;
  25. import org.apache.poi.hwpf.extractor.Word6Extractor;
  26. import org.apache.poi.hwpf.model.OldFontTable;
  27. import org.junit.Test;
  28. /**
  29. * Tests for Word 6 and Word 95 support
  30. */
  31. public final class TestHWPFOldDocument extends HWPFTestCase {
  32. /**
  33. * Test a simple Word 6 document
  34. */
  35. @Test(expected=OldFileFormatException.class)
  36. public void testWord6hwpf() throws IOException {
  37. // Can't open as HWPFDocument
  38. HWPFTestDataSamples.openSampleFile("Word6.doc");
  39. }
  40. @Test
  41. public void testWord6hwpfOld() throws IOException {
  42. // Open
  43. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6.doc");
  44. // Check
  45. assertEquals(1, doc.getRange().numSections());
  46. assertEquals(1, doc.getRange().numParagraphs());
  47. assertEquals(1, doc.getRange().numCharacterRuns());
  48. assertEquals("The quick brown fox jumps over the lazy dog\r",
  49. doc.getRange().getParagraph(0).text());
  50. doc.close();
  51. }
  52. /**
  53. * Test a simple Word 95 document
  54. */
  55. @Test(expected=OldFileFormatException.class)
  56. public void testWord95hwpf() throws IOException {
  57. // Can't open as HWPFDocument
  58. HWPFTestDataSamples.openSampleFile("Word95.doc");
  59. }
  60. @Test
  61. public void testWord95hwpfOld() throws IOException {
  62. // Open
  63. HWPFOldDocument doc = HWPFTestDataSamples
  64. .openOldSampleFile("Word95.doc");
  65. // Check
  66. assertEquals(1, doc.getRange().numSections());
  67. assertEquals(7, doc.getRange().numParagraphs());
  68. assertEquals("The quick brown fox jumps over the lazy dog\r",
  69. doc.getRange().getParagraph(0).text());
  70. assertEquals("\r", doc.getRange().getParagraph(1).text());
  71. assertEquals("Paragraph 2\r", doc.getRange().getParagraph(2).text());
  72. assertEquals("\r", doc.getRange().getParagraph(3).text());
  73. assertEquals(
  74. "Paragraph 3. Has some RED text and some "
  75. + "BLUE BOLD text in it.\r",
  76. doc.getRange().getParagraph(4).text());
  77. assertEquals("\r", doc.getRange().getParagraph(5).text());
  78. assertEquals("Last (4th) paragraph.\r",
  79. doc.getRange().getParagraph(6).text());
  80. assertEquals(1, doc.getRange().getParagraph(0).numCharacterRuns());
  81. assertEquals(1, doc.getRange().getParagraph(1).numCharacterRuns());
  82. assertEquals(1, doc.getRange().getParagraph(2).numCharacterRuns());
  83. assertEquals(1, doc.getRange().getParagraph(3).numCharacterRuns());
  84. // Normal, red, normal, blue+bold, normal
  85. assertEquals(5, doc.getRange().getParagraph(4).numCharacterRuns());
  86. assertEquals(1, doc.getRange().getParagraph(5).numCharacterRuns());
  87. // Normal, superscript for 4th, normal
  88. assertEquals(3, doc.getRange().getParagraph(6).numCharacterRuns());
  89. doc.close();
  90. }
  91. /**
  92. * Test a word document that has sections, as well as the usual paragraph
  93. * stuff.
  94. */
  95. @Test
  96. public void testWord6Sections() throws IOException {
  97. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Word6_sections.doc");
  98. assertEquals(3, doc.getRange().numSections());
  99. assertEquals(6, doc.getRange().numParagraphs());
  100. assertEquals("This is a test.\r",
  101. doc.getRange().getParagraph(0).text());
  102. assertEquals("\r", doc.getRange().getParagraph(1).text());
  103. // Section / line?
  104. assertEquals("\u000c", doc.getRange().getParagraph(2).text());
  105. assertEquals("This is a new section.\r",
  106. doc.getRange().getParagraph(3).text());
  107. // Section / line?
  108. assertEquals("\u000c", doc.getRange().getParagraph(4).text());
  109. assertEquals("\r", doc.getRange().getParagraph(5).text());
  110. doc.close();
  111. }
  112. /**
  113. * Another word document with sections, this time with a few more section
  114. * properties set on it
  115. */
  116. @Test
  117. public void testWord6Sections2() throws IOException {
  118. HWPFOldDocument doc = HWPFTestDataSamples
  119. .openOldSampleFile("Word6_sections2.doc");
  120. assertEquals(1, doc.getRange().numSections());
  121. assertEquals(57, doc.getRange().numParagraphs());
  122. assertEquals("\r", doc.getRange().getParagraph(0).text());
  123. assertEquals("STATEMENT OF INSOLVENCY PRACTICE 10 (SCOTLAND)\r",
  124. doc.getRange().getParagraph(1).text());
  125. doc.close();
  126. }
  127. @Test
  128. public void testDefaultCodePageEncoding() throws IOException {
  129. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942.doc");
  130. Word6Extractor ex = new Word6Extractor(doc);
  131. String txt = ex.getText();
  132. assertContains(txt, "BERTHOD");
  133. assertContains(txt, "APPLICOLOR");
  134. assertContains(txt, "les meilleurs");
  135. assertContains(txt, "GUY LECOLE");
  136. }
  137. @Test
  138. public void testCodePageBug50955() throws IOException {
  139. //windows 1251
  140. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug50955.doc");
  141. Word6Extractor ex = new Word6Extractor(doc);
  142. StringBuilder sb = new StringBuilder();
  143. for (String p : ex.getParagraphText()) {
  144. sb.append(p);
  145. }
  146. assertContains(sb.toString(), "\u043F\u0440\u0438\u0432\u0435\u0442");//Greetings!
  147. }
  148. @Test
  149. public void testCodePageBug60936() throws IOException {
  150. //windows 1250 -- this test file was generated with OpenOffice
  151. //see https://bz.apache.org/ooo/show_bug.cgi?id=12445 for the inspiration
  152. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60936.doc");
  153. Word6Extractor ex = new Word6Extractor(doc);
  154. StringBuilder sb = new StringBuilder();
  155. for (String p : ex.getParagraphText()) {
  156. sb.append(p);
  157. }
  158. assertContains(sb.toString(), "4 sk\u00f3re a p\u0159ed 7 lety");//Greetings!
  159. }
  160. @Test
  161. public void testOldFontTableEncoding() throws IOException {
  162. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
  163. OldFontTable oldFontTable = doc.getOldFontTable();
  164. assertEquals(5, oldFontTable.getFontNames().length);
  165. assertEquals("\u7D30\u660E\u9AD4", oldFontTable.getFontNames()[0].getMainFontName());
  166. assertEquals(HwmfFont.WmfCharset.CHINESEBIG5_CHARSET.getCharset(), Charset.forName("Big5"));
  167. assertEquals("Times New Roman", oldFontTable.getFontNames()[1].getMainFontName());
  168. doc.close();
  169. }
  170. @Test
  171. public void testOldFontTableAltName() throws IOException {
  172. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug60942b.doc");
  173. OldFontTable oldFontTable = doc.getOldFontTable();
  174. assertEquals(5, oldFontTable.getFontNames().length);
  175. assertEquals("Roboto", oldFontTable.getFontNames()[3].getMainFontName());
  176. assertEquals("arial", oldFontTable.getFontNames()[3].getAltFontName());
  177. assertEquals("Roboto", oldFontTable.getFontNames()[4].getMainFontName());
  178. assertEquals("arial", oldFontTable.getFontNames()[4].getAltFontName());
  179. }
  180. @Test
  181. public void test51944() throws IOException {
  182. HWPFOldDocument doc = HWPFTestDataSamples.openOldSampleFile("Bug51944.doc");
  183. Word6Extractor ex = new Word6Extractor(doc);
  184. StringBuilder sb = new StringBuilder();
  185. for (String p : ex.getParagraphText()) {
  186. sb.append(p.replaceAll("[\r\n]+", "\n"));
  187. }
  188. String txt = sb.toString();
  189. assertContains(txt, "Post and Fax");
  190. assertContains(txt, "also maintain");//this is at a critical juncture
  191. assertContains(txt, "which are available for");//this too
  192. //TODO: figure out why these two aren't passing
  193. // assertContains(txt, "\u2019\u0078 block2");//make sure smart quote is extracted correctly
  194. // assertContains(txt, "We are able to");//not sure if we can get this easily?
  195. }
  196. }