You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

UnicodeString.java 16KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516
  1. /* ====================================================================
  2. Licensed to the Apache Software Foundation (ASF) under one or more
  3. contributor license agreements. See the NOTICE file distributed with
  4. this work for additional information regarding copyright ownership.
  5. The ASF licenses this file to You under the Apache License, Version 2.0
  6. (the "License"); you may not use this file except in compliance with
  7. the License. You may obtain a copy of the License at
  8. http://www.apache.org/licenses/LICENSE-2.0
  9. Unless required by applicable law or agreed to in writing, software
  10. distributed under the License is distributed on an "AS IS" BASIS,
  11. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. See the License for the specific language governing permissions and
  13. limitations under the License.
  14. ==================================================================== */
  15. package org.apache.poi.hssf.record.common;
  16. import java.util.ArrayList;
  17. import java.util.Collections;
  18. import java.util.Iterator;
  19. import java.util.List;
  20. import java.util.Map;
  21. import java.util.Objects;
  22. import java.util.function.Supplier;
  23. import java.util.stream.Collectors;
  24. import org.apache.logging.log4j.LogManager;
  25. import org.apache.logging.log4j.Logger;
  26. import org.apache.poi.common.Duplicatable;
  27. import org.apache.poi.common.usermodel.GenericRecord;
  28. import org.apache.poi.hssf.record.RecordInputStream;
  29. import org.apache.poi.hssf.record.cont.ContinuableRecordInput;
  30. import org.apache.poi.hssf.record.cont.ContinuableRecordOutput;
  31. import org.apache.poi.util.BitField;
  32. import org.apache.poi.util.BitFieldFactory;
  33. import org.apache.poi.util.GenericRecordUtil;
  34. import static org.apache.logging.log4j.util.Unbox.box;
  35. /**
  36. * Unicode String - just standard fields that are in several records.
  37. * It is considered more desirable then repeating it in all of them.<p>
  38. * This is often called a XLUnicodeRichExtendedString in MS documentation.<p>
  39. */
  40. public class UnicodeString implements Comparable<UnicodeString>, Duplicatable, GenericRecord {
  41. private static final Logger LOG = LogManager.getLogger(UnicodeString.class);
  42. private static final BitField highByte = BitFieldFactory.getInstance(0x1);
  43. // 0x2 is reserved
  44. private static final BitField extBit = BitFieldFactory.getInstance(0x4);
  45. private static final BitField richText = BitFieldFactory.getInstance(0x8);
  46. private short field_1_charCount;
  47. private byte field_2_optionflags;
  48. private String field_3_string;
  49. private List<FormatRun> field_4_format_runs;
  50. private ExtRst field_5_ext_rst;
  51. private UnicodeString(UnicodeString other) {
  52. field_1_charCount = other.field_1_charCount;
  53. field_2_optionflags = other.field_2_optionflags;
  54. field_3_string = other.field_3_string;
  55. field_4_format_runs = (other.field_4_format_runs == null) ? null :
  56. other.field_4_format_runs.stream().map(FormatRun::new).collect(Collectors.toList());
  57. field_5_ext_rst = (other.field_5_ext_rst == null) ? null : other.field_5_ext_rst.copy();
  58. }
  59. public UnicodeString(String str) {
  60. setString(str);
  61. }
  62. /**
  63. * construct a unicode string record and fill its fields, ID is ignored
  64. * @param in the RecordInputstream to read the record from
  65. */
  66. public UnicodeString(RecordInputStream in) {
  67. field_1_charCount = in.readShort();
  68. field_2_optionflags = in.readByte();
  69. int runCount = 0;
  70. int extensionLength = 0;
  71. //Read the number of rich runs if rich text.
  72. if (isRichText()) {
  73. runCount = in.readShort();
  74. }
  75. //Read the size of extended data if present.
  76. if (isExtendedText()) {
  77. extensionLength = in.readInt();
  78. }
  79. boolean isCompressed = ((field_2_optionflags & 1) == 0);
  80. int cc = getCharCount();
  81. field_3_string = (isCompressed) ? in.readCompressedUnicode(cc) : in.readUnicodeLEString(cc);
  82. if (isRichText() && (runCount > 0)) {
  83. field_4_format_runs = new ArrayList<>(runCount);
  84. for (int i=0;i<runCount;i++) {
  85. field_4_format_runs.add(new FormatRun(in));
  86. }
  87. }
  88. if (isExtendedText() && (extensionLength > 0)) {
  89. field_5_ext_rst = new ExtRst(new ContinuableRecordInput(in), extensionLength);
  90. if(field_5_ext_rst.getDataSize()+4 != extensionLength) {
  91. LOG.atWarn().log("ExtRst was supposed to be {} bytes long, but seems to actually be {}", box(extensionLength),box(field_5_ext_rst.getDataSize() + 4));
  92. }
  93. }
  94. }
  95. public int hashCode() {
  96. return Objects.hash(field_1_charCount, field_3_string);
  97. }
  98. /**
  99. * Our handling of equals is inconsistent with compareTo. The trouble is because we don't truely understand
  100. * rich text fields yet it's difficult to make a sound comparison.
  101. *
  102. * @param o The object to compare.
  103. * @return true if the object is actually equal.
  104. */
  105. public boolean equals(Object o)
  106. {
  107. if (!(o instanceof UnicodeString)) {
  108. return false;
  109. }
  110. UnicodeString other = (UnicodeString) o;
  111. //OK lets do this in stages to return a quickly, first check the actual string
  112. if (field_1_charCount != other.field_1_charCount
  113. || field_2_optionflags != other.field_2_optionflags
  114. || !field_3_string.equals(other.field_3_string)) {
  115. return false;
  116. }
  117. //OK string appears to be equal but now lets compare formatting runs
  118. if (field_4_format_runs == null) {
  119. // Strings are equal, and there are not formatting runs.
  120. return (other.field_4_format_runs == null);
  121. } else if (other.field_4_format_runs == null) {
  122. // Strings are equal, but one or the other has formatting runs
  123. return false;
  124. }
  125. //Strings are equal, so now compare formatting runs.
  126. int size = field_4_format_runs.size();
  127. if (size != other.field_4_format_runs.size()) {
  128. return false;
  129. }
  130. for (int i=0;i<size;i++) {
  131. FormatRun run1 = field_4_format_runs.get(i);
  132. FormatRun run2 = other.field_4_format_runs.get(i);
  133. if (!run1.equals(run2)) {
  134. return false;
  135. }
  136. }
  137. // Well the format runs are equal as well!, better check the ExtRst data
  138. if (field_5_ext_rst == null) {
  139. return (other.field_5_ext_rst == null);
  140. } else if (other.field_5_ext_rst == null) {
  141. return false;
  142. }
  143. return field_5_ext_rst.equals(other.field_5_ext_rst);
  144. }
  145. /**
  146. * get the number of characters in the string,
  147. * as an un-wrapped int
  148. *
  149. * @return number of characters
  150. */
  151. public int getCharCount() {
  152. if(field_1_charCount < 0) {
  153. return field_1_charCount + 65536;
  154. }
  155. return field_1_charCount;
  156. }
  157. /**
  158. * get the number of characters in the string,
  159. * wrapped as needed to fit within a short
  160. *
  161. * @return number of characters
  162. */
  163. public short getCharCountShort() {
  164. return field_1_charCount;
  165. }
  166. /**
  167. * set the number of characters in the string
  168. * @param cc - number of characters
  169. */
  170. public void setCharCount(short cc)
  171. {
  172. field_1_charCount = cc;
  173. }
  174. /**
  175. * get the option flags which among other things return if this is a 16-bit or
  176. * 8 bit string
  177. *
  178. * @return optionflags bitmask
  179. *
  180. */
  181. public byte getOptionFlags()
  182. {
  183. return field_2_optionflags;
  184. }
  185. /**
  186. * set the option flags which among other things return if this is a 16-bit or
  187. * 8 bit string
  188. *
  189. * @param of optionflags bitmask
  190. *
  191. */
  192. public void setOptionFlags(byte of)
  193. {
  194. field_2_optionflags = of;
  195. }
  196. /**
  197. * @return the actual string this contains as a java String object
  198. */
  199. public String getString()
  200. {
  201. return field_3_string;
  202. }
  203. /**
  204. * set the actual string this contains
  205. * @param string the text
  206. */
  207. public void setString(String string)
  208. {
  209. field_3_string = string;
  210. setCharCount((short)field_3_string.length());
  211. // scan for characters greater than 255 ... if any are
  212. // present, we have to use 16-bit encoding. Otherwise, we
  213. // can use 8-bit encoding
  214. boolean useUTF16 = false;
  215. int strlen = string.length();
  216. for ( int j = 0; j < strlen; j++ ) {
  217. if ( string.charAt( j ) > 255 ) {
  218. useUTF16 = true;
  219. break;
  220. }
  221. }
  222. if (useUTF16) {
  223. //Set the uncompressed bit
  224. field_2_optionflags = highByte.setByte(field_2_optionflags);
  225. } else {
  226. field_2_optionflags = highByte.clearByte(field_2_optionflags);
  227. }
  228. }
  229. public int getFormatRunCount() {
  230. return (field_4_format_runs == null) ? 0 : field_4_format_runs.size();
  231. }
  232. public FormatRun getFormatRun(int index) {
  233. if (field_4_format_runs == null) {
  234. return null;
  235. }
  236. if (index < 0 || index >= field_4_format_runs.size()) {
  237. return null;
  238. }
  239. return field_4_format_runs.get(index);
  240. }
  241. private int findFormatRunAt(int characterPos) {
  242. int size = field_4_format_runs.size();
  243. for (int i=0;i<size;i++) {
  244. FormatRun r = field_4_format_runs.get(i);
  245. if (r._character == characterPos) {
  246. return i;
  247. } else if (r._character > characterPos) {
  248. return -1;
  249. }
  250. }
  251. return -1;
  252. }
  253. /** Adds a font run to the formatted string.
  254. *
  255. * If a font run exists at the current charcter location, then it is
  256. * replaced with the font run to be added.
  257. */
  258. public void addFormatRun(FormatRun r) {
  259. if (field_4_format_runs == null) {
  260. field_4_format_runs = new ArrayList<>();
  261. }
  262. int index = findFormatRunAt(r._character);
  263. if (index != -1) {
  264. field_4_format_runs.remove(index);
  265. }
  266. field_4_format_runs.add(r);
  267. //Need to sort the font runs to ensure that the font runs appear in
  268. //character order
  269. Collections.sort(field_4_format_runs);
  270. //Make sure that we now say that we are a rich string
  271. field_2_optionflags = richText.setByte(field_2_optionflags);
  272. }
  273. public Iterator<FormatRun> formatIterator() {
  274. if (field_4_format_runs != null) {
  275. return field_4_format_runs.iterator();
  276. }
  277. return null;
  278. }
  279. public void removeFormatRun(FormatRun r) {
  280. field_4_format_runs.remove(r);
  281. if (field_4_format_runs.size() == 0) {
  282. field_4_format_runs = null;
  283. field_2_optionflags = richText.clearByte(field_2_optionflags);
  284. }
  285. }
  286. public void clearFormatting() {
  287. field_4_format_runs = null;
  288. field_2_optionflags = richText.clearByte(field_2_optionflags);
  289. }
  290. public ExtRst getExtendedRst() {
  291. return this.field_5_ext_rst;
  292. }
  293. void setExtendedRst(ExtRst ext_rst) {
  294. if (ext_rst != null) {
  295. field_2_optionflags = extBit.setByte(field_2_optionflags);
  296. } else {
  297. field_2_optionflags = extBit.clearByte(field_2_optionflags);
  298. }
  299. this.field_5_ext_rst = ext_rst;
  300. }
  301. /**
  302. * Swaps all use in the string of one font index
  303. * for use of a different font index.
  304. * Normally only called when fonts have been
  305. * removed / re-ordered
  306. */
  307. public void swapFontUse(short oldFontIndex, short newFontIndex) {
  308. if (field_4_format_runs != null) {
  309. for (FormatRun run : field_4_format_runs) {
  310. if(run._fontIndex == oldFontIndex) {
  311. run._fontIndex = newFontIndex;
  312. }
  313. }
  314. }
  315. }
  316. /**
  317. * unlike the real records we return the same as "getString()" rather than debug info
  318. * @see #getDebugInfo()
  319. * @return String value of the record
  320. */
  321. public String toString()
  322. {
  323. return getString();
  324. }
  325. /**
  326. * return a character representation of the fields of this record
  327. *
  328. *
  329. * @return String of output for biffviewer etc.
  330. *
  331. */
  332. public String getDebugInfo()
  333. {
  334. StringBuilder buffer = new StringBuilder();
  335. buffer.append("[UNICODESTRING]\n");
  336. buffer.append(" .charcount = ")
  337. .append(Integer.toHexString(getCharCount())).append("\n");
  338. buffer.append(" .optionflags = ")
  339. .append(Integer.toHexString(getOptionFlags())).append("\n");
  340. buffer.append(" .string = ").append(getString()).append("\n");
  341. if (field_4_format_runs != null) {
  342. for (int i = 0; i < field_4_format_runs.size();i++) {
  343. FormatRun r = field_4_format_runs.get(i);
  344. buffer.append(" .format_run").append(i).append(" = ").append(r).append("\n");
  345. }
  346. }
  347. if (field_5_ext_rst != null) {
  348. buffer.append(" .field_5_ext_rst = ").append("\n");
  349. buffer.append(field_5_ext_rst).append("\n");
  350. }
  351. buffer.append("[/UNICODESTRING]\n");
  352. return buffer.toString();
  353. }
  354. /**
  355. * Serialises out the String. There are special rules
  356. * about where we can and can't split onto
  357. * Continue records.
  358. */
  359. public void serialize(ContinuableRecordOutput out) {
  360. int numberOfRichTextRuns = 0;
  361. int extendedDataSize = 0;
  362. if (isRichText() && field_4_format_runs != null) {
  363. numberOfRichTextRuns = field_4_format_runs.size();
  364. }
  365. if (isExtendedText() && field_5_ext_rst != null) {
  366. extendedDataSize = 4 + field_5_ext_rst.getDataSize();
  367. }
  368. // Serialise the bulk of the String
  369. // The writeString handles tricky continue stuff for us
  370. out.writeString(field_3_string, numberOfRichTextRuns, extendedDataSize);
  371. if (numberOfRichTextRuns > 0) {
  372. //This will ensure that a run does not split a continue
  373. for (int i=0;i<numberOfRichTextRuns;i++) {
  374. if (out.getAvailableSpace() < 4) {
  375. out.writeContinue();
  376. }
  377. FormatRun r = field_4_format_runs.get(i);
  378. r.serialize(out);
  379. }
  380. }
  381. if (extendedDataSize > 0 && field_5_ext_rst != null) {
  382. field_5_ext_rst.serialize(out);
  383. }
  384. }
  385. public int compareTo(UnicodeString str) {
  386. int result = getString().compareTo(str.getString());
  387. //As per the equals method lets do this in stages
  388. if (result != 0) {
  389. return result;
  390. }
  391. //OK string appears to be equal but now lets compare formatting runs
  392. if (field_4_format_runs == null) {
  393. //Strings are equal, and there are no formatting runs. -> 0
  394. //Strings are equal, but one or the other has formatting runs -> 1
  395. return (str.field_4_format_runs == null) ? 0 : 1;
  396. } else if (str.field_4_format_runs == null) {
  397. //Strings are equal, but one or the other has formatting runs
  398. return -1;
  399. }
  400. //Strings are equal, so now compare formatting runs.
  401. int size = field_4_format_runs.size();
  402. if (size != str.field_4_format_runs.size()) {
  403. return size - str.field_4_format_runs.size();
  404. }
  405. for (int i=0;i<size;i++) {
  406. FormatRun run1 = field_4_format_runs.get(i);
  407. FormatRun run2 = str.field_4_format_runs.get(i);
  408. result = run1.compareTo(run2);
  409. if (result != 0) {
  410. return result;
  411. }
  412. }
  413. //Well the format runs are equal as well!, better check the ExtRst data
  414. if (field_5_ext_rst == null) {
  415. return (str.field_5_ext_rst == null) ? 0 : 1;
  416. } else if (str.field_5_ext_rst == null) {
  417. return -1;
  418. } else {
  419. return field_5_ext_rst.compareTo(str.field_5_ext_rst);
  420. }
  421. }
  422. private boolean isRichText() {
  423. return richText.isSet(getOptionFlags());
  424. }
  425. private boolean isExtendedText() {
  426. return extBit.isSet(getOptionFlags());
  427. }
  428. @Override
  429. public UnicodeString copy() {
  430. return new UnicodeString(this);
  431. }
  432. @Override
  433. public Map<String, Supplier<?>> getGenericProperties() {
  434. return GenericRecordUtil.getGenericProperties(
  435. "charCount", this::getCharCount,
  436. "optionFlags", this::getOptionFlags,
  437. "string", this::getString,
  438. "formatRuns", () -> field_4_format_runs,
  439. "extendedRst", this::getExtendedRst
  440. );
  441. }
  442. }