You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RawParseUtils.java 33KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101
  1. /*
  2. * Copyright (C) 2008-2009, Google Inc.
  3. * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
  4. * and other copyright owners as documented in the project's IP log.
  5. *
  6. * This program and the accompanying materials are made available
  7. * under the terms of the Eclipse Distribution License v1.0 which
  8. * accompanies this distribution, is reproduced below, and is
  9. * available at http://www.eclipse.org/org/documents/edl-v10.php
  10. *
  11. * All rights reserved.
  12. *
  13. * Redistribution and use in source and binary forms, with or
  14. * without modification, are permitted provided that the following
  15. * conditions are met:
  16. *
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. *
  20. * - Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials provided
  23. * with the distribution.
  24. *
  25. * - Neither the name of the Eclipse Foundation, Inc. nor the
  26. * names of its contributors may be used to endorse or promote
  27. * products derived from this software without specific prior
  28. * written permission.
  29. *
  30. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  31. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  32. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  33. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  35. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  36. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  38. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  40. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  41. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  42. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43. */
  44. package org.eclipse.jgit.util;
  45. import static org.eclipse.jgit.lib.ObjectChecker.author;
  46. import static org.eclipse.jgit.lib.ObjectChecker.committer;
  47. import static org.eclipse.jgit.lib.ObjectChecker.encoding;
  48. import static org.eclipse.jgit.lib.ObjectChecker.tagger;
  49. import java.nio.ByteBuffer;
  50. import java.nio.charset.CharacterCodingException;
  51. import java.nio.charset.Charset;
  52. import java.nio.charset.CharsetDecoder;
  53. import java.nio.charset.CodingErrorAction;
  54. import java.nio.charset.IllegalCharsetNameException;
  55. import java.nio.charset.UnsupportedCharsetException;
  56. import java.util.Arrays;
  57. import java.util.HashMap;
  58. import java.util.Map;
  59. import org.eclipse.jgit.lib.Constants;
  60. import org.eclipse.jgit.lib.PersonIdent;
  61. /** Handy utility functions to parse raw object contents. */
  62. public final class RawParseUtils {
  /**
   * UTF-8 charset constant.
   *
   * @since 2.2
   */
  public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$

  /** Decimal digit values indexed by ASCII code; -1 marks a non-digit. */
  private static final byte[] digits10;

  /** Hex digit values indexed by ASCII code (both cases); -1 marks a non-digit. */
  private static final byte[] digits16;

  /** Holds 1 at the ASCII code of every byte allowed in a footer line key. */
  private static final byte[] footerLineKeyChars;

  /** Encoding names that {@link Charset#forName(String)} may not know. */
  private static final Map<String, Charset> encodingAliases;

  static {
    encodingAliases = new HashMap<String, Charset>();
    encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$

    // Tables are only as long as the highest code they need; any byte
    // beyond that (or a negative byte) makes the lookup throw
    // ArrayIndexOutOfBoundsException.
    final byte[] dec = new byte['9' + 1];
    Arrays.fill(dec, (byte) -1);
    for (int d = 0; d < 10; d++) {
      dec['0' + d] = (byte) d;
    }
    digits10 = dec;

    final byte[] hex = new byte['f' + 1];
    Arrays.fill(hex, (byte) -1);
    for (int d = 0; d < 10; d++) {
      hex['0' + d] = (byte) d;
    }
    for (int d = 0; d < 6; d++) {
      hex['a' + d] = (byte) (10 + d);
      hex['A' + d] = (byte) (10 + d);
    }
    digits16 = hex;

    final byte[] key = new byte['z' + 1];
    key['-'] = 1;
    for (int c = '0'; c <= '9'; c++) {
      key[c] = 1;
    }
    for (int i = 0; i < 26; i++) {
      key['A' + i] = 1;
      key['a' + i] = 1;
    }
    footerLineKeyChars = key;
  }
  97. /**
  98. * Determine if b[ptr] matches src.
  99. *
  100. * @param b
  101. * the buffer to scan.
  102. * @param ptr
  103. * first position within b, this should match src[0].
  104. * @param src
  105. * the buffer to test for equality with b.
  106. * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  107. */
  108. public static final int match(final byte[] b, int ptr, final byte[] src) {
  109. if (ptr + src.length > b.length)
  110. return -1;
  111. for (int i = 0; i < src.length; i++, ptr++)
  112. if (b[ptr] != src[i])
  113. return -1;
  114. return ptr;
  115. }
  116. private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  117. '6', '7', '8', '9' };
  118. /**
  119. * Format a base 10 numeric into a temporary buffer.
  120. * <p>
  121. * Formatting is performed backwards. The method starts at offset
  122. * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  123. * <code>digits</code> is the number of positions necessary to store the
  124. * base 10 value.
  125. * <p>
  126. * The argument and return values from this method make it easy to chain
  127. * writing, for example:
  128. * </p>
  129. *
  130. * <pre>
  131. * final byte[] tmp = new byte[64];
  132. * int ptr = tmp.length;
  133. * tmp[--ptr] = '\n';
  134. * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  135. * tmp[--ptr] = ' ';
  136. * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  137. * tmp[--ptr] = 0;
  138. * final String str = new String(tmp, ptr, tmp.length - ptr);
  139. * </pre>
  140. *
  141. * @param b
  142. * buffer to write into.
  143. * @param o
  144. * one offset past the location where writing will begin; writing
  145. * proceeds towards lower index values.
  146. * @param value
  147. * the value to store.
  148. * @return the new offset value <code>o</code>. This is the position of
  149. * the last byte written. Additional writing should start at one
  150. * position earlier.
  151. */
  152. public static int formatBase10(final byte[] b, int o, int value) {
  153. if (value == 0) {
  154. b[--o] = '0';
  155. return o;
  156. }
  157. final boolean isneg = value < 0;
  158. if (isneg)
  159. value = -value;
  160. while (value != 0) {
  161. b[--o] = base10byte[value % 10];
  162. value /= 10;
  163. }
  164. if (isneg)
  165. b[--o] = '-';
  166. return o;
  167. }
  168. /**
  169. * Parse a base 10 numeric from a sequence of ASCII digits into an int.
  170. * <p>
  171. * Digit sequences can begin with an optional run of spaces before the
  172. * sequence, and may start with a '+' or a '-' to indicate sign position.
  173. * Any other characters will cause the method to stop and return the current
  174. * result to the caller.
  175. *
  176. * @param b
  177. * buffer to scan.
  178. * @param ptr
  179. * position within buffer to start parsing digits at.
  180. * @param ptrResult
  181. * optional location to return the new ptr value through. If null
  182. * the ptr value will be discarded.
  183. * @return the value at this location; 0 if the location is not a valid
  184. * numeric.
  185. */
  186. public static final int parseBase10(final byte[] b, int ptr,
  187. final MutableInteger ptrResult) {
  188. int r = 0;
  189. int sign = 0;
  190. try {
  191. final int sz = b.length;
  192. while (ptr < sz && b[ptr] == ' ')
  193. ptr++;
  194. if (ptr >= sz)
  195. return 0;
  196. switch (b[ptr]) {
  197. case '-':
  198. sign = -1;
  199. ptr++;
  200. break;
  201. case '+':
  202. ptr++;
  203. break;
  204. }
  205. while (ptr < sz) {
  206. final byte v = digits10[b[ptr]];
  207. if (v < 0)
  208. break;
  209. r = (r * 10) + v;
  210. ptr++;
  211. }
  212. } catch (ArrayIndexOutOfBoundsException e) {
  213. // Not a valid digit.
  214. }
  215. if (ptrResult != null)
  216. ptrResult.value = ptr;
  217. return sign < 0 ? -r : r;
  218. }
  219. /**
  220. * Parse a base 10 numeric from a sequence of ASCII digits into a long.
  221. * <p>
  222. * Digit sequences can begin with an optional run of spaces before the
  223. * sequence, and may start with a '+' or a '-' to indicate sign position.
  224. * Any other characters will cause the method to stop and return the current
  225. * result to the caller.
  226. *
  227. * @param b
  228. * buffer to scan.
  229. * @param ptr
  230. * position within buffer to start parsing digits at.
  231. * @param ptrResult
  232. * optional location to return the new ptr value through. If null
  233. * the ptr value will be discarded.
  234. * @return the value at this location; 0 if the location is not a valid
  235. * numeric.
  236. */
  237. public static final long parseLongBase10(final byte[] b, int ptr,
  238. final MutableInteger ptrResult) {
  239. long r = 0;
  240. int sign = 0;
  241. try {
  242. final int sz = b.length;
  243. while (ptr < sz && b[ptr] == ' ')
  244. ptr++;
  245. if (ptr >= sz)
  246. return 0;
  247. switch (b[ptr]) {
  248. case '-':
  249. sign = -1;
  250. ptr++;
  251. break;
  252. case '+':
  253. ptr++;
  254. break;
  255. }
  256. while (ptr < sz) {
  257. final byte v = digits10[b[ptr]];
  258. if (v < 0)
  259. break;
  260. r = (r * 10) + v;
  261. ptr++;
  262. }
  263. } catch (ArrayIndexOutOfBoundsException e) {
  264. // Not a valid digit.
  265. }
  266. if (ptrResult != null)
  267. ptrResult.value = ptr;
  268. return sign < 0 ? -r : r;
  269. }
  270. /**
  271. * Parse 4 character base 16 (hex) formatted string to unsigned integer.
  272. * <p>
  273. * The number is read in network byte order, that is, most significant
  274. * nybble first.
  275. *
  276. * @param bs
  277. * buffer to parse digits from; positions {@code [p, p+4)} will
  278. * be parsed.
  279. * @param p
  280. * first position within the buffer to parse.
  281. * @return the integer value.
  282. * @throws ArrayIndexOutOfBoundsException
  283. * if the string is not hex formatted.
  284. */
  285. public static final int parseHexInt16(final byte[] bs, final int p) {
  286. int r = digits16[bs[p]] << 4;
  287. r |= digits16[bs[p + 1]];
  288. r <<= 4;
  289. r |= digits16[bs[p + 2]];
  290. r <<= 4;
  291. r |= digits16[bs[p + 3]];
  292. if (r < 0)
  293. throw new ArrayIndexOutOfBoundsException();
  294. return r;
  295. }
  296. /**
  297. * Parse 8 character base 16 (hex) formatted string to unsigned integer.
  298. * <p>
  299. * The number is read in network byte order, that is, most significant
  300. * nybble first.
  301. *
  302. * @param bs
  303. * buffer to parse digits from; positions {@code [p, p+8)} will
  304. * be parsed.
  305. * @param p
  306. * first position within the buffer to parse.
  307. * @return the integer value.
  308. * @throws ArrayIndexOutOfBoundsException
  309. * if the string is not hex formatted.
  310. */
  311. public static final int parseHexInt32(final byte[] bs, final int p) {
  312. int r = digits16[bs[p]] << 4;
  313. r |= digits16[bs[p + 1]];
  314. r <<= 4;
  315. r |= digits16[bs[p + 2]];
  316. r <<= 4;
  317. r |= digits16[bs[p + 3]];
  318. r <<= 4;
  319. r |= digits16[bs[p + 4]];
  320. r <<= 4;
  321. r |= digits16[bs[p + 5]];
  322. r <<= 4;
  323. r |= digits16[bs[p + 6]];
  324. final int last = digits16[bs[p + 7]];
  325. if (r < 0 || last < 0)
  326. throw new ArrayIndexOutOfBoundsException();
  327. return (r << 4) | last;
  328. }
  329. /**
  330. * Parse a single hex digit to its numeric value (0-15).
  331. *
  332. * @param digit
  333. * hex character to parse.
  334. * @return numeric value, in the range 0-15.
  335. * @throws ArrayIndexOutOfBoundsException
  336. * if the input digit is not a valid hex digit.
  337. */
  338. public static final int parseHexInt4(final byte digit) {
  339. final byte r = digits16[digit];
  340. if (r < 0)
  341. throw new ArrayIndexOutOfBoundsException();
  342. return r;
  343. }
  344. /**
  345. * Parse a Git style timezone string.
  346. * <p>
  347. * The sequence "-0315" will be parsed as the numeric value -195, as the
  348. * lower two positions count minutes, not 100ths of an hour.
  349. *
  350. * @param b
  351. * buffer to scan.
  352. * @param ptr
  353. * position within buffer to start parsing digits at.
  354. * @return the timezone at this location, expressed in minutes.
  355. */
  356. public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
  357. final int v = parseBase10(b, ptr, null);
  358. final int tzMins = v % 100;
  359. final int tzHours = v / 100;
  360. return tzHours * 60 + tzMins;
  361. }
  362. /**
  363. * Locate the first position after a given character.
  364. *
  365. * @param b
  366. * buffer to scan.
  367. * @param ptr
  368. * position within buffer to start looking for chrA at.
  369. * @param chrA
  370. * character to find.
  371. * @return new position just after chrA.
  372. */
  373. public static final int next(final byte[] b, int ptr, final char chrA) {
  374. final int sz = b.length;
  375. while (ptr < sz) {
  376. if (b[ptr++] == chrA)
  377. return ptr;
  378. }
  379. return ptr;
  380. }
  381. /**
  382. * Locate the first position after the next LF.
  383. * <p>
  384. * This method stops on the first '\n' it finds.
  385. *
  386. * @param b
  387. * buffer to scan.
  388. * @param ptr
  389. * position within buffer to start looking for LF at.
  390. * @return new position just after the first LF found.
  391. */
  392. public static final int nextLF(final byte[] b, int ptr) {
  393. return next(b, ptr, '\n');
  394. }
  395. /**
  396. * Locate the first position after either the given character or LF.
  397. * <p>
  398. * This method stops on the first match it finds from either chrA or '\n'.
  399. *
  400. * @param b
  401. * buffer to scan.
  402. * @param ptr
  403. * position within buffer to start looking for chrA or LF at.
  404. * @param chrA
  405. * character to find.
  406. * @return new position just after the first chrA or LF to be found.
  407. */
  408. public static final int nextLF(final byte[] b, int ptr, final char chrA) {
  409. final int sz = b.length;
  410. while (ptr < sz) {
  411. final byte c = b[ptr++];
  412. if (c == chrA || c == '\n')
  413. return ptr;
  414. }
  415. return ptr;
  416. }
  417. /**
  418. * Locate the first position before a given character.
  419. *
  420. * @param b
  421. * buffer to scan.
  422. * @param ptr
  423. * position within buffer to start looking for chrA at.
  424. * @param chrA
  425. * character to find.
  426. * @return new position just before chrA, -1 for not found
  427. */
  428. public static final int prev(final byte[] b, int ptr, final char chrA) {
  429. if (ptr == b.length)
  430. --ptr;
  431. while (ptr >= 0) {
  432. if (b[ptr--] == chrA)
  433. return ptr;
  434. }
  435. return ptr;
  436. }
  437. /**
  438. * Locate the first position before the previous LF.
  439. * <p>
  440. * This method stops on the first '\n' it finds.
  441. *
  442. * @param b
  443. * buffer to scan.
  444. * @param ptr
  445. * position within buffer to start looking for LF at.
  446. * @return new position just before the first LF found, -1 for not found
  447. */
  448. public static final int prevLF(final byte[] b, int ptr) {
  449. return prev(b, ptr, '\n');
  450. }
  451. /**
  452. * Locate the previous position before either the given character or LF.
  453. * <p>
  454. * This method stops on the first match it finds from either chrA or '\n'.
  455. *
  456. * @param b
  457. * buffer to scan.
  458. * @param ptr
  459. * position within buffer to start looking for chrA or LF at.
  460. * @param chrA
  461. * character to find.
  462. * @return new position just before the first chrA or LF to be found, -1 for
  463. * not found
  464. */
  465. public static final int prevLF(final byte[] b, int ptr, final char chrA) {
  466. if (ptr == b.length)
  467. --ptr;
  468. while (ptr >= 0) {
  469. final byte c = b[ptr--];
  470. if (c == chrA || c == '\n')
  471. return ptr;
  472. }
  473. return ptr;
  474. }
  475. /**
  476. * Index the region between <code>[ptr, end)</code> to find line starts.
  477. * <p>
  478. * The returned list is 1 indexed. Index 0 contains
  479. * {@link Integer#MIN_VALUE} to pad the list out.
  480. * <p>
  481. * Using a 1 indexed list means that line numbers can be directly accessed
  482. * from the list, so <code>list.get(1)</code> (aka get line 1) returns
  483. * <code>ptr</code>.
  484. * <p>
  485. * The last element (index <code>map.size()-1</code>) always contains
  486. * <code>end</code>.
  487. *
  488. * @param buf
  489. * buffer to scan.
  490. * @param ptr
  491. * position within the buffer corresponding to the first byte of
  492. * line 1.
  493. * @param end
  494. * 1 past the end of the content within <code>buf</code>.
  495. * @return a line map indexing the start position of each line.
  496. */
  497. public static final IntList lineMap(final byte[] buf, int ptr, int end) {
  498. // Experimentally derived from multiple source repositories
  499. // the average number of bytes/line is 36. Its a rough guess
  500. // to initially size our map close to the target.
  501. //
  502. final IntList map = new IntList((end - ptr) / 36);
  503. map.fillTo(1, Integer.MIN_VALUE);
  504. for (; ptr < end; ptr = nextLF(buf, ptr))
  505. map.add(ptr);
  506. map.add(end);
  507. return map;
  508. }
  509. /**
  510. * Locate the "author " header line data.
  511. *
  512. * @param b
  513. * buffer to scan.
  514. * @param ptr
  515. * position in buffer to start the scan at. Most callers should
  516. * pass 0 to ensure the scan starts from the beginning of the
  517. * commit buffer and does not accidentally look at message body.
  518. * @return position just after the space in "author ", so the first
  519. * character of the author's name. If no author header can be
  520. * located -1 is returned.
  521. */
  522. public static final int author(final byte[] b, int ptr) {
  523. final int sz = b.length;
  524. if (ptr == 0)
  525. ptr += 46; // skip the "tree ..." line.
  526. while (ptr < sz && b[ptr] == 'p')
  527. ptr += 48; // skip this parent.
  528. return match(b, ptr, author);
  529. }
  530. /**
  531. * Locate the "committer " header line data.
  532. *
  533. * @param b
  534. * buffer to scan.
  535. * @param ptr
  536. * position in buffer to start the scan at. Most callers should
  537. * pass 0 to ensure the scan starts from the beginning of the
  538. * commit buffer and does not accidentally look at message body.
  539. * @return position just after the space in "committer ", so the first
  540. * character of the committer's name. If no committer header can be
  541. * located -1 is returned.
  542. */
  543. public static final int committer(final byte[] b, int ptr) {
  544. final int sz = b.length;
  545. if (ptr == 0)
  546. ptr += 46; // skip the "tree ..." line.
  547. while (ptr < sz && b[ptr] == 'p')
  548. ptr += 48; // skip this parent.
  549. if (ptr < sz && b[ptr] == 'a')
  550. ptr = nextLF(b, ptr);
  551. return match(b, ptr, committer);
  552. }
  553. /**
  554. * Locate the "tagger " header line data.
  555. *
  556. * @param b
  557. * buffer to scan.
  558. * @param ptr
  559. * position in buffer to start the scan at. Most callers should
  560. * pass 0 to ensure the scan starts from the beginning of the tag
  561. * buffer and does not accidentally look at message body.
  562. * @return position just after the space in "tagger ", so the first
  563. * character of the tagger's name. If no tagger header can be
  564. * located -1 is returned.
  565. */
  566. public static final int tagger(final byte[] b, int ptr) {
  567. final int sz = b.length;
  568. if (ptr == 0)
  569. ptr += 48; // skip the "object ..." line.
  570. while (ptr < sz) {
  571. if (b[ptr] == '\n')
  572. return -1;
  573. final int m = match(b, ptr, tagger);
  574. if (m >= 0)
  575. return m;
  576. ptr = nextLF(b, ptr);
  577. }
  578. return -1;
  579. }
  580. /**
  581. * Locate the "encoding " header line.
  582. *
  583. * @param b
  584. * buffer to scan.
  585. * @param ptr
  586. * position in buffer to start the scan at. Most callers should
  587. * pass 0 to ensure the scan starts from the beginning of the
  588. * buffer and does not accidentally look at the message body.
  589. * @return position just after the space in "encoding ", so the first
  590. * character of the encoding's name. If no encoding header can be
  591. * located -1 is returned (and UTF-8 should be assumed).
  592. */
  593. public static final int encoding(final byte[] b, int ptr) {
  594. final int sz = b.length;
  595. while (ptr < sz) {
  596. if (b[ptr] == '\n')
  597. return -1;
  598. if (b[ptr] == 'e')
  599. break;
  600. ptr = nextLF(b, ptr);
  601. }
  602. return match(b, ptr, encoding);
  603. }
  604. /**
  605. * Parse the "encoding " header into a character set reference.
  606. * <p>
  607. * Locates the "encoding " header (if present) by first calling
  608. * {@link #encoding(byte[], int)} and then returns the proper character set
  609. * to apply to this buffer to evaluate its contents as character data.
  610. * <p>
  611. * If no encoding header is present, {@link Constants#CHARSET} is assumed.
  612. *
  613. * @param b
  614. * buffer to scan.
  615. * @return the Java character set representation. Never null.
  616. */
  617. public static Charset parseEncoding(final byte[] b) {
  618. final int enc = encoding(b, 0);
  619. if (enc < 0)
  620. return Constants.CHARSET;
  621. final int lf = nextLF(b, enc);
  622. String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
  623. try {
  624. return Charset.forName(decoded);
  625. } catch (IllegalCharsetNameException badName) {
  626. Charset aliased = charsetForAlias(decoded);
  627. if (aliased != null)
  628. return aliased;
  629. throw badName;
  630. } catch (UnsupportedCharsetException badName) {
  631. Charset aliased = charsetForAlias(decoded);
  632. if (aliased != null)
  633. return aliased;
  634. throw badName;
  635. }
  636. }
  637. /**
  638. * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
  639. * <p>
  640. * Leading spaces won't be trimmed from the string, i.e. will show up in the
  641. * parsed name afterwards.
  642. *
  643. * @param in
  644. * the string to parse a name from.
  645. * @return the parsed identity or null in case the identity could not be
  646. * parsed.
  647. */
  648. public static PersonIdent parsePersonIdent(final String in) {
  649. return parsePersonIdent(Constants.encode(in), 0);
  650. }
  651. /**
  652. * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
  653. * <p>
  654. * When passing in a value for <code>nameB</code> callers should use the
  655. * return value of {@link #author(byte[], int)} or
  656. * {@link #committer(byte[], int)}, as these methods provide the proper
  657. * position within the buffer.
  658. *
  659. * @param raw
  660. * the buffer to parse character data from.
  661. * @param nameB
  662. * first position of the identity information. This should be the
  663. * first position after the space which delimits the header field
  664. * name (e.g. "author" or "committer") from the rest of the
  665. * identity line.
  666. * @return the parsed identity or null in case the identity could not be
  667. * parsed.
  668. */
  669. public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
  670. final Charset cs = parseEncoding(raw);
  671. final int emailB = nextLF(raw, nameB, '<');
  672. final int emailE = nextLF(raw, emailB, '>');
  673. if (emailB >= raw.length || raw[emailB] == '\n' ||
  674. (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
  675. return null;
  676. final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
  677. emailB - 2 : emailB - 1;
  678. final String name = decode(cs, raw, nameB, nameEnd);
  679. final String email = decode(cs, raw, emailB, emailE - 1);
  680. // Start searching from end of line, as after first name-email pair,
  681. // another name-email pair may occur. We will ignore all kinds of
  682. // "junk" following the first email.
  683. //
  684. // We've to use (emailE - 1) for the case that raw[email] is LF,
  685. // otherwise we would run too far. "-2" is necessary to position
  686. // before the LF in case of LF termination resp. the penultimate
  687. // character if there is no trailing LF.
  688. final int tzBegin = lastIndexOfTrim(raw, ' ',
  689. nextLF(raw, emailE - 1) - 2) + 1;
  690. if (tzBegin <= emailE) // No time/zone, still valid
  691. return new PersonIdent(name, email, 0, 0);
  692. final int whenBegin = Math.max(emailE,
  693. lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
  694. if (whenBegin >= tzBegin - 1) // No time/zone, still valid
  695. return new PersonIdent(name, email, 0, 0);
  696. final long when = parseLongBase10(raw, whenBegin, null);
  697. final int tz = parseTimeZoneOffset(raw, tzBegin);
  698. return new PersonIdent(name, email, when * 1000L, tz);
  699. }
  700. /**
  701. * Parse a name data (e.g. as within a reflog) into a PersonIdent.
  702. * <p>
  703. * When passing in a value for <code>nameB</code> callers should use the
  704. * return value of {@link #author(byte[], int)} or
  705. * {@link #committer(byte[], int)}, as these methods provide the proper
  706. * position within the buffer.
  707. *
  708. * @param raw
  709. * the buffer to parse character data from.
  710. * @param nameB
  711. * first position of the identity information. This should be the
  712. * first position after the space which delimits the header field
  713. * name (e.g. "author" or "committer") from the rest of the
  714. * identity line.
  715. * @return the parsed identity. Never null.
  716. */
  717. public static PersonIdent parsePersonIdentOnly(final byte[] raw,
  718. final int nameB) {
  719. int stop = nextLF(raw, nameB);
  720. int emailB = nextLF(raw, nameB, '<');
  721. int emailE = nextLF(raw, emailB, '>');
  722. final String name;
  723. final String email;
  724. if (emailE < stop) {
  725. email = decode(raw, emailB, emailE - 1);
  726. } else {
  727. email = "invalid"; //$NON-NLS-1$
  728. }
  729. if (emailB < stop)
  730. name = decode(raw, nameB, emailB - 2);
  731. else
  732. name = decode(raw, nameB, stop);
  733. final MutableInteger ptrout = new MutableInteger();
  734. long when;
  735. int tz;
  736. if (emailE < stop) {
  737. when = parseLongBase10(raw, emailE + 1, ptrout);
  738. tz = parseTimeZoneOffset(raw, ptrout.value);
  739. } else {
  740. when = 0;
  741. tz = 0;
  742. }
  743. return new PersonIdent(name, email, when * 1000L, tz);
  744. }
  745. /**
  746. * Locate the end of a footer line key string.
  747. * <p>
  748. * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
  749. * "Signed-off-by: A. U. Thor\n") then this method returns the position of
  750. * the first ':'.
  751. * <p>
  752. * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
  753. * then this method returns -1.
  754. *
  755. * @param raw
  756. * buffer to scan.
  757. * @param ptr
  758. * first position within raw to consider as a footer line key.
  759. * @return position of the ':' which terminates the footer line key if this
  760. * is otherwise a valid footer line key; otherwise -1.
  761. */
  762. public static int endOfFooterLineKey(final byte[] raw, int ptr) {
  763. try {
  764. for (;;) {
  765. final byte c = raw[ptr];
  766. if (footerLineKeyChars[c] == 0) {
  767. if (c == ':')
  768. return ptr;
  769. return -1;
  770. }
  771. ptr++;
  772. }
  773. } catch (ArrayIndexOutOfBoundsException e) {
  774. return -1;
  775. }
  776. }
  777. /**
  778. * Decode a buffer under UTF-8, if possible.
  779. *
  780. * If the byte stream cannot be decoded that way, the platform default is tried
  781. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  782. *
  783. * @param buffer
  784. * buffer to pull raw bytes from.
  785. * @return a string representation of the range <code>[start,end)</code>,
  786. * after decoding the region through the specified character set.
  787. */
  788. public static String decode(final byte[] buffer) {
  789. return decode(buffer, 0, buffer.length);
  790. }
  791. /**
  792. * Decode a buffer under UTF-8, if possible.
  793. *
  794. * If the byte stream cannot be decoded that way, the platform default is
  795. * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  796. *
  797. * @param buffer
  798. * buffer to pull raw bytes from.
  799. * @param start
  800. * start position in buffer
  801. * @param end
  802. * one position past the last location within the buffer to take
  803. * data from.
  804. * @return a string representation of the range <code>[start,end)</code>,
  805. * after decoding the region through the specified character set.
  806. */
  807. public static String decode(final byte[] buffer, final int start,
  808. final int end) {
  809. return decode(Constants.CHARSET, buffer, start, end);
  810. }
  811. /**
  812. * Decode a buffer under the specified character set if possible.
  813. *
  814. * If the byte stream cannot be decoded that way, the platform default is tried
  815. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  816. *
  817. * @param cs
  818. * character set to use when decoding the buffer.
  819. * @param buffer
  820. * buffer to pull raw bytes from.
  821. * @return a string representation of the range <code>[start,end)</code>,
  822. * after decoding the region through the specified character set.
  823. */
  824. public static String decode(final Charset cs, final byte[] buffer) {
  825. return decode(cs, buffer, 0, buffer.length);
  826. }
  827. /**
  828. * Decode a region of the buffer under the specified character set if possible.
  829. *
  830. * If the byte stream cannot be decoded that way, the platform default is tried
  831. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  832. *
  833. * @param cs
  834. * character set to use when decoding the buffer.
  835. * @param buffer
  836. * buffer to pull raw bytes from.
  837. * @param start
  838. * first position within the buffer to take data from.
  839. * @param end
  840. * one position past the last location within the buffer to take
  841. * data from.
  842. * @return a string representation of the range <code>[start,end)</code>,
  843. * after decoding the region through the specified character set.
  844. */
  845. public static String decode(final Charset cs, final byte[] buffer,
  846. final int start, final int end) {
  847. try {
  848. return decodeNoFallback(cs, buffer, start, end);
  849. } catch (CharacterCodingException e) {
  850. // Fall back to an ISO-8859-1 style encoding. At least all of
  851. // the bytes will be present in the output.
  852. //
  853. return extractBinaryString(buffer, start, end);
  854. }
  855. }
  856. /**
  857. * Decode a region of the buffer under the specified character set if
  858. * possible.
  859. *
  860. * If the byte stream cannot be decoded that way, the platform default is
  861. * tried and if that too fails, an exception is thrown.
  862. *
  863. * @param cs
  864. * character set to use when decoding the buffer.
  865. * @param buffer
  866. * buffer to pull raw bytes from.
  867. * @param start
  868. * first position within the buffer to take data from.
  869. * @param end
  870. * one position past the last location within the buffer to take
  871. * data from.
  872. * @return a string representation of the range <code>[start,end)</code>,
  873. * after decoding the region through the specified character set.
  874. * @throws CharacterCodingException
  875. * the input is not in any of the tested character sets.
  876. */
  877. public static String decodeNoFallback(final Charset cs,
  878. final byte[] buffer, final int start, final int end)
  879. throws CharacterCodingException {
  880. final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
  881. b.mark();
  882. // Try our built-in favorite. The assumption here is that
  883. // decoding will fail if the data is not actually encoded
  884. // using that encoder.
  885. //
  886. try {
  887. return decode(b, Constants.CHARSET);
  888. } catch (CharacterCodingException e) {
  889. b.reset();
  890. }
  891. if (!cs.equals(Constants.CHARSET)) {
  892. // Try the suggested encoding, it might be right since it was
  893. // provided by the caller.
  894. //
  895. try {
  896. return decode(b, cs);
  897. } catch (CharacterCodingException e) {
  898. b.reset();
  899. }
  900. }
  901. // Try the default character set. A small group of people
  902. // might actually use the same (or very similar) locale.
  903. //
  904. final Charset defcs = Charset.defaultCharset();
  905. if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
  906. try {
  907. return decode(b, defcs);
  908. } catch (CharacterCodingException e) {
  909. b.reset();
  910. }
  911. }
  912. throw new CharacterCodingException();
  913. }
  914. /**
  915. * Decode a region of the buffer under the ISO-8859-1 encoding.
  916. *
  917. * Each byte is treated as a single character in the 8859-1 character
  918. * encoding, performing a raw binary-&gt;char conversion.
  919. *
  920. * @param buffer
  921. * buffer to pull raw bytes from.
  922. * @param start
  923. * first position within the buffer to take data from.
  924. * @param end
  925. * one position past the last location within the buffer to take
  926. * data from.
  927. * @return a string representation of the range <code>[start,end)</code>.
  928. */
  929. public static String extractBinaryString(final byte[] buffer,
  930. final int start, final int end) {
  931. final StringBuilder r = new StringBuilder(end - start);
  932. for (int i = start; i < end; i++)
  933. r.append((char) (buffer[i] & 0xff));
  934. return r.toString();
  935. }
  936. private static String decode(final ByteBuffer b, final Charset charset)
  937. throws CharacterCodingException {
  938. final CharsetDecoder d = charset.newDecoder();
  939. d.onMalformedInput(CodingErrorAction.REPORT);
  940. d.onUnmappableCharacter(CodingErrorAction.REPORT);
  941. return d.decode(b).toString();
  942. }
  943. /**
  944. * Locate the position of the commit message body.
  945. *
  946. * @param b
  947. * buffer to scan.
  948. * @param ptr
  949. * position in buffer to start the scan at. Most callers should
  950. * pass 0 to ensure the scan starts from the beginning of the
  951. * commit buffer.
  952. * @return position of the user's message buffer.
  953. */
  954. public static final int commitMessage(final byte[] b, int ptr) {
  955. final int sz = b.length;
  956. if (ptr == 0)
  957. ptr += 46; // skip the "tree ..." line.
  958. while (ptr < sz && b[ptr] == 'p')
  959. ptr += 48; // skip this parent.
  960. // Skip any remaining header lines, ignoring what their actual
  961. // header line type is. This is identical to the logic for a tag.
  962. //
  963. return tagMessage(b, ptr);
  964. }
  965. /**
  966. * Locate the position of the tag message body.
  967. *
  968. * @param b
  969. * buffer to scan.
  970. * @param ptr
  971. * position in buffer to start the scan at. Most callers should
  972. * pass 0 to ensure the scan starts from the beginning of the tag
  973. * buffer.
  974. * @return position of the user's message buffer.
  975. */
  976. public static final int tagMessage(final byte[] b, int ptr) {
  977. final int sz = b.length;
  978. if (ptr == 0)
  979. ptr += 48; // skip the "object ..." line.
  980. while (ptr < sz && b[ptr] != '\n')
  981. ptr = nextLF(b, ptr);
  982. if (ptr < sz && b[ptr] == '\n')
  983. return ptr + 1;
  984. return -1;
  985. }
  986. /**
  987. * Locate the end of a paragraph.
  988. * <p>
  989. * A paragraph is ended by two consecutive LF bytes or CRLF pairs
  990. *
  991. * @param b
  992. * buffer to scan.
  993. * @param start
  994. * position in buffer to start the scan at. Most callers will
  995. * want to pass the first position of the commit message (as
  996. * found by {@link #commitMessage(byte[], int)}.
  997. * @return position of the LF at the end of the paragraph;
  998. * <code>b.length</code> if no paragraph end could be located.
  999. */
  1000. public static final int endOfParagraph(final byte[] b, final int start) {
  1001. int ptr = start;
  1002. final int sz = b.length;
  1003. while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
  1004. ptr = nextLF(b, ptr);
  1005. if (ptr > start && b[ptr - 1] == '\n')
  1006. ptr--;
  1007. if (ptr > start && b[ptr - 1] == '\r')
  1008. ptr--;
  1009. return ptr;
  1010. }
  1011. private static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
  1012. while (pos >= 0 && raw[pos] == ' ')
  1013. pos--;
  1014. while (pos >= 0 && raw[pos] != ch)
  1015. pos--;
  1016. return pos;
  1017. }
  1018. private static Charset charsetForAlias(String name) {
  1019. return encodingAliases.get(StringUtils.toLowerCase(name));
  1020. }
  private RawParseUtils() {
    // Static-only utility class: instances are never needed.
  }
  1024. }