You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

RawParseUtils.java 31KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016
  1. /*
  2. * Copyright (C) 2008-2009, Google Inc.
  3. * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
  4. * and other copyright owners as documented in the project's IP log.
  5. *
  6. * This program and the accompanying materials are made available
  7. * under the terms of the Eclipse Distribution License v1.0 which
  8. * accompanies this distribution, is reproduced below, and is
  9. * available at http://www.eclipse.org/org/documents/edl-v10.php
  10. *
  11. * All rights reserved.
  12. *
  13. * Redistribution and use in source and binary forms, with or
  14. * without modification, are permitted provided that the following
  15. * conditions are met:
  16. *
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. *
  20. * - Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials provided
  23. * with the distribution.
  24. *
  25. * - Neither the name of the Eclipse Foundation, Inc. nor the
  26. * names of its contributors may be used to endorse or promote
  27. * products derived from this software without specific prior
  28. * written permission.
  29. *
  30. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  31. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  32. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  33. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  35. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  36. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  38. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  40. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  41. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  42. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43. */
  44. package org.eclipse.jgit.util;
  45. import static org.eclipse.jgit.lib.ObjectChecker.author;
  46. import static org.eclipse.jgit.lib.ObjectChecker.committer;
  47. import static org.eclipse.jgit.lib.ObjectChecker.encoding;
  48. import static org.eclipse.jgit.lib.ObjectChecker.tagger;
  49. import java.nio.ByteBuffer;
  50. import java.nio.charset.CharacterCodingException;
  51. import java.nio.charset.Charset;
  52. import java.nio.charset.CharsetDecoder;
  53. import java.nio.charset.CodingErrorAction;
  54. import java.util.Arrays;
  55. import org.eclipse.jgit.lib.Constants;
  56. import org.eclipse.jgit.lib.PersonIdent;
  57. /** Handy utility functions to parse raw object contents. */
  58. public final class RawParseUtils {
  /** Indexed by ASCII code: value of a decimal digit, or -1 if not a digit. */
  private static final byte[] digits10;

  /** Indexed by ASCII code: value of a hex digit (either case), or -1. */
  private static final byte[] digits16;

  /** Indexed by ASCII code: 1 if the character may appear in a footer key. */
  private static final byte[] footerLineKeyChars;

  static {
      // Decimal table only needs to reach '9'; larger codes fall off the end.
      final byte[] dec = new byte['9' + 1];
      Arrays.fill(dec, (byte) -1);
      for (int d = 0; d < 10; d++)
          dec['0' + d] = (byte) d;
      digits10 = dec;

      // Hex table reaches 'f' so that both upper and lower case fit.
      final byte[] hex = new byte['f' + 1];
      Arrays.fill(hex, (byte) -1);
      for (int d = 0; d < 10; d++)
          hex['0' + d] = (byte) d;
      for (int d = 0; d < 6; d++) {
          hex['a' + d] = (byte) (10 + d);
          hex['A' + d] = (byte) (10 + d);
      }
      digits16 = hex;

      // Footer keys are made of [A-Za-z0-9-]; everything else stays 0.
      final byte[] key = new byte['z' + 1];
      key['-'] = 1;
      for (int c = '0'; c <= '9'; c++)
          key[c] = 1;
      for (int k = 0; k < 26; k++) {
          key['A' + k] = 1;
          key['a' + k] = 1;
      }
      footerLineKeyChars = key;
  }
  84. /**
  85. * Determine if b[ptr] matches src.
  86. *
  87. * @param b
  88. * the buffer to scan.
  89. * @param ptr
  90. * first position within b, this should match src[0].
  91. * @param src
  92. * the buffer to test for equality with b.
  93. * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  94. */
  95. public static final int match(final byte[] b, int ptr, final byte[] src) {
  96. if (ptr + src.length > b.length)
  97. return -1;
  98. for (int i = 0; i < src.length; i++, ptr++)
  99. if (b[ptr] != src[i])
  100. return -1;
  101. return ptr;
  102. }
  103. private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  104. '6', '7', '8', '9' };
  105. /**
  106. * Format a base 10 numeric into a temporary buffer.
  107. * <p>
  108. * Formatting is performed backwards. The method starts at offset
  109. * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  110. * <code>digits</code> is the number of positions necessary to store the
  111. * base 10 value.
  112. * <p>
  113. * The argument and return values from this method make it easy to chain
  114. * writing, for example:
  115. * </p>
  116. *
  117. * <pre>
  118. * final byte[] tmp = new byte[64];
  119. * int ptr = tmp.length;
  120. * tmp[--ptr] = '\n';
  121. * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  122. * tmp[--ptr] = ' ';
  123. * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  124. * tmp[--ptr] = 0;
  125. * final String str = new String(tmp, ptr, tmp.length - ptr);
  126. * </pre>
  127. *
  128. * @param b
  129. * buffer to write into.
  130. * @param o
  131. * one offset past the location where writing will begin; writing
  132. * proceeds towards lower index values.
  133. * @param value
  134. * the value to store.
  135. * @return the new offset value <code>o</code>. This is the position of
  136. * the last byte written. Additional writing should start at one
  137. * position earlier.
  138. */
  139. public static int formatBase10(final byte[] b, int o, int value) {
  140. if (value == 0) {
  141. b[--o] = '0';
  142. return o;
  143. }
  144. final boolean isneg = value < 0;
  145. while (value != 0) {
  146. b[--o] = base10byte[value % 10];
  147. value /= 10;
  148. }
  149. if (isneg)
  150. b[--o] = '-';
  151. return o;
  152. }
  153. /**
  154. * Parse a base 10 numeric from a sequence of ASCII digits into an int.
  155. * <p>
  156. * Digit sequences can begin with an optional run of spaces before the
  157. * sequence, and may start with a '+' or a '-' to indicate sign position.
  158. * Any other characters will cause the method to stop and return the current
  159. * result to the caller.
  160. *
  161. * @param b
  162. * buffer to scan.
  163. * @param ptr
  164. * position within buffer to start parsing digits at.
  165. * @param ptrResult
  166. * optional location to return the new ptr value through. If null
  167. * the ptr value will be discarded.
  168. * @return the value at this location; 0 if the location is not a valid
  169. * numeric.
  170. */
  171. public static final int parseBase10(final byte[] b, int ptr,
  172. final MutableInteger ptrResult) {
  173. int r = 0;
  174. int sign = 0;
  175. try {
  176. final int sz = b.length;
  177. while (ptr < sz && b[ptr] == ' ')
  178. ptr++;
  179. if (ptr >= sz)
  180. return 0;
  181. switch (b[ptr]) {
  182. case '-':
  183. sign = -1;
  184. ptr++;
  185. break;
  186. case '+':
  187. ptr++;
  188. break;
  189. }
  190. while (ptr < sz) {
  191. final byte v = digits10[b[ptr]];
  192. if (v < 0)
  193. break;
  194. r = (r * 10) + v;
  195. ptr++;
  196. }
  197. } catch (ArrayIndexOutOfBoundsException e) {
  198. // Not a valid digit.
  199. }
  200. if (ptrResult != null)
  201. ptrResult.value = ptr;
  202. return sign < 0 ? -r : r;
  203. }
  204. /**
  205. * Parse a base 10 numeric from a sequence of ASCII digits into a long.
  206. * <p>
  207. * Digit sequences can begin with an optional run of spaces before the
  208. * sequence, and may start with a '+' or a '-' to indicate sign position.
  209. * Any other characters will cause the method to stop and return the current
  210. * result to the caller.
  211. *
  212. * @param b
  213. * buffer to scan.
  214. * @param ptr
  215. * position within buffer to start parsing digits at.
  216. * @param ptrResult
  217. * optional location to return the new ptr value through. If null
  218. * the ptr value will be discarded.
  219. * @return the value at this location; 0 if the location is not a valid
  220. * numeric.
  221. */
  222. public static final long parseLongBase10(final byte[] b, int ptr,
  223. final MutableInteger ptrResult) {
  224. long r = 0;
  225. int sign = 0;
  226. try {
  227. final int sz = b.length;
  228. while (ptr < sz && b[ptr] == ' ')
  229. ptr++;
  230. if (ptr >= sz)
  231. return 0;
  232. switch (b[ptr]) {
  233. case '-':
  234. sign = -1;
  235. ptr++;
  236. break;
  237. case '+':
  238. ptr++;
  239. break;
  240. }
  241. while (ptr < sz) {
  242. final byte v = digits10[b[ptr]];
  243. if (v < 0)
  244. break;
  245. r = (r * 10) + v;
  246. ptr++;
  247. }
  248. } catch (ArrayIndexOutOfBoundsException e) {
  249. // Not a valid digit.
  250. }
  251. if (ptrResult != null)
  252. ptrResult.value = ptr;
  253. return sign < 0 ? -r : r;
  254. }
  255. /**
  256. * Parse 4 character base 16 (hex) formatted string to unsigned integer.
  257. * <p>
  258. * The number is read in network byte order, that is, most significant
  259. * nybble first.
  260. *
  261. * @param bs
  262. * buffer to parse digits from; positions {@code [p, p+4)} will
  263. * be parsed.
  264. * @param p
  265. * first position within the buffer to parse.
  266. * @return the integer value.
  267. * @throws ArrayIndexOutOfBoundsException
  268. * if the string is not hex formatted.
  269. */
  270. public static final int parseHexInt16(final byte[] bs, final int p) {
  271. int r = digits16[bs[p]] << 4;
  272. r |= digits16[bs[p + 1]];
  273. r <<= 4;
  274. r |= digits16[bs[p + 2]];
  275. r <<= 4;
  276. r |= digits16[bs[p + 3]];
  277. if (r < 0)
  278. throw new ArrayIndexOutOfBoundsException();
  279. return r;
  280. }
  281. /**
  282. * Parse 8 character base 16 (hex) formatted string to unsigned integer.
  283. * <p>
  284. * The number is read in network byte order, that is, most significant
  285. * nybble first.
  286. *
  287. * @param bs
  288. * buffer to parse digits from; positions {@code [p, p+8)} will
  289. * be parsed.
  290. * @param p
  291. * first position within the buffer to parse.
  292. * @return the integer value.
  293. * @throws ArrayIndexOutOfBoundsException
  294. * if the string is not hex formatted.
  295. */
  296. public static final int parseHexInt32(final byte[] bs, final int p) {
  297. int r = digits16[bs[p]] << 4;
  298. r |= digits16[bs[p + 1]];
  299. r <<= 4;
  300. r |= digits16[bs[p + 2]];
  301. r <<= 4;
  302. r |= digits16[bs[p + 3]];
  303. r <<= 4;
  304. r |= digits16[bs[p + 4]];
  305. r <<= 4;
  306. r |= digits16[bs[p + 5]];
  307. r <<= 4;
  308. r |= digits16[bs[p + 6]];
  309. final int last = digits16[bs[p + 7]];
  310. if (r < 0 || last < 0)
  311. throw new ArrayIndexOutOfBoundsException();
  312. return (r << 4) | last;
  313. }
  314. /**
  315. * Parse a single hex digit to its numeric value (0-15).
  316. *
  317. * @param digit
  318. * hex character to parse.
  319. * @return numeric value, in the range 0-15.
  320. * @throws ArrayIndexOutOfBoundsException
  321. * if the input digit is not a valid hex digit.
  322. */
  323. public static final int parseHexInt4(final byte digit) {
  324. final byte r = digits16[digit];
  325. if (r < 0)
  326. throw new ArrayIndexOutOfBoundsException();
  327. return r;
  328. }
  329. /**
  330. * Parse a Git style timezone string.
  331. * <p>
  332. * The sequence "-0315" will be parsed as the numeric value -195, as the
  333. * lower two positions count minutes, not 100ths of an hour.
  334. *
  335. * @param b
  336. * buffer to scan.
  337. * @param ptr
  338. * position within buffer to start parsing digits at.
  339. * @return the timezone at this location, expressed in minutes.
  340. */
  341. public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
  342. final int v = parseBase10(b, ptr, null);
  343. final int tzMins = v % 100;
  344. final int tzHours = v / 100;
  345. return tzHours * 60 + tzMins;
  346. }
  347. /**
  348. * Locate the first position after a given character.
  349. *
  350. * @param b
  351. * buffer to scan.
  352. * @param ptr
  353. * position within buffer to start looking for chrA at.
  354. * @param chrA
  355. * character to find.
  356. * @return new position just after chrA.
  357. */
  358. public static final int next(final byte[] b, int ptr, final char chrA) {
  359. final int sz = b.length;
  360. while (ptr < sz) {
  361. if (b[ptr++] == chrA)
  362. return ptr;
  363. }
  364. return ptr;
  365. }
  366. /**
  367. * Locate the first position after the next LF.
  368. * <p>
  369. * This method stops on the first '\n' it finds.
  370. *
  371. * @param b
  372. * buffer to scan.
  373. * @param ptr
  374. * position within buffer to start looking for LF at.
  375. * @return new position just after the first LF found.
  376. */
  377. public static final int nextLF(final byte[] b, int ptr) {
  378. return next(b, ptr, '\n');
  379. }
  380. /**
  381. * Locate the first position after either the given character or LF.
  382. * <p>
  383. * This method stops on the first match it finds from either chrA or '\n'.
  384. *
  385. * @param b
  386. * buffer to scan.
  387. * @param ptr
  388. * position within buffer to start looking for chrA or LF at.
  389. * @param chrA
  390. * character to find.
  391. * @return new position just after the first chrA or LF to be found.
  392. */
  393. public static final int nextLF(final byte[] b, int ptr, final char chrA) {
  394. final int sz = b.length;
  395. while (ptr < sz) {
  396. final byte c = b[ptr++];
  397. if (c == chrA || c == '\n')
  398. return ptr;
  399. }
  400. return ptr;
  401. }
  402. /**
  403. * Locate the first position before a given character.
  404. *
  405. * @param b
  406. * buffer to scan.
  407. * @param ptr
  408. * position within buffer to start looking for chrA at.
  409. * @param chrA
  410. * character to find.
  411. * @return new position just before chrA, -1 for not found
  412. */
  413. public static final int prev(final byte[] b, int ptr, final char chrA) {
  414. if (ptr == b.length)
  415. --ptr;
  416. while (ptr >= 0) {
  417. if (b[ptr--] == chrA)
  418. return ptr;
  419. }
  420. return ptr;
  421. }
  422. /**
  423. * Locate the first position before the previous LF.
  424. * <p>
  425. * This method stops on the first '\n' it finds.
  426. *
  427. * @param b
  428. * buffer to scan.
  429. * @param ptr
  430. * position within buffer to start looking for LF at.
  431. * @return new position just before the first LF found, -1 for not found
  432. */
  433. public static final int prevLF(final byte[] b, int ptr) {
  434. return prev(b, ptr, '\n');
  435. }
  436. /**
  437. * Locate the previous position before either the given character or LF.
  438. * <p>
  439. * This method stops on the first match it finds from either chrA or '\n'.
  440. *
  441. * @param b
  442. * buffer to scan.
  443. * @param ptr
  444. * position within buffer to start looking for chrA or LF at.
  445. * @param chrA
  446. * character to find.
  447. * @return new position just before the first chrA or LF to be found, -1 for
  448. * not found
  449. */
  450. public static final int prevLF(final byte[] b, int ptr, final char chrA) {
  451. if (ptr == b.length)
  452. --ptr;
  453. while (ptr >= 0) {
  454. final byte c = b[ptr--];
  455. if (c == chrA || c == '\n')
  456. return ptr;
  457. }
  458. return ptr;
  459. }
  460. /**
  461. * Index the region between <code>[ptr, end)</code> to find line starts.
  462. * <p>
  463. * The returned list is 1 indexed. Index 0 contains
  464. * {@link Integer#MIN_VALUE} to pad the list out.
  465. * <p>
  466. * Using a 1 indexed list means that line numbers can be directly accessed
  467. * from the list, so <code>list.get(1)</code> (aka get line 1) returns
  468. * <code>ptr</code>.
  469. * <p>
  470. * The last element (index <code>map.size()-1</code>) always contains
  471. * <code>end</code>.
  472. *
  473. * @param buf
  474. * buffer to scan.
  475. * @param ptr
  476. * position within the buffer corresponding to the first byte of
  477. * line 1.
  478. * @param end
  479. * 1 past the end of the content within <code>buf</code>.
  480. * @return a line map indexing the start position of each line.
  481. */
  482. public static final IntList lineMap(final byte[] buf, int ptr, int end) {
  483. // Experimentally derived from multiple source repositories
  484. // the average number of bytes/line is 36. Its a rough guess
  485. // to initially size our map close to the target.
  486. //
  487. final IntList map = new IntList((end - ptr) / 36);
  488. map.fillTo(1, Integer.MIN_VALUE);
  489. for (; ptr < end; ptr = nextLF(buf, ptr))
  490. map.add(ptr);
  491. map.add(end);
  492. return map;
  493. }
  494. /**
  495. * Locate the "author " header line data.
  496. *
  497. * @param b
  498. * buffer to scan.
  499. * @param ptr
  500. * position in buffer to start the scan at. Most callers should
  501. * pass 0 to ensure the scan starts from the beginning of the
  502. * commit buffer and does not accidentally look at message body.
  503. * @return position just after the space in "author ", so the first
  504. * character of the author's name. If no author header can be
  505. * located -1 is returned.
  506. */
  507. public static final int author(final byte[] b, int ptr) {
  508. final int sz = b.length;
  509. if (ptr == 0)
  510. ptr += 46; // skip the "tree ..." line.
  511. while (ptr < sz && b[ptr] == 'p')
  512. ptr += 48; // skip this parent.
  513. return match(b, ptr, author);
  514. }
  515. /**
  516. * Locate the "committer " header line data.
  517. *
  518. * @param b
  519. * buffer to scan.
  520. * @param ptr
  521. * position in buffer to start the scan at. Most callers should
  522. * pass 0 to ensure the scan starts from the beginning of the
  523. * commit buffer and does not accidentally look at message body.
  524. * @return position just after the space in "committer ", so the first
  525. * character of the committer's name. If no committer header can be
  526. * located -1 is returned.
  527. */
  528. public static final int committer(final byte[] b, int ptr) {
  529. final int sz = b.length;
  530. if (ptr == 0)
  531. ptr += 46; // skip the "tree ..." line.
  532. while (ptr < sz && b[ptr] == 'p')
  533. ptr += 48; // skip this parent.
  534. if (ptr < sz && b[ptr] == 'a')
  535. ptr = nextLF(b, ptr);
  536. return match(b, ptr, committer);
  537. }
  538. /**
  539. * Locate the "tagger " header line data.
  540. *
  541. * @param b
  542. * buffer to scan.
  543. * @param ptr
  544. * position in buffer to start the scan at. Most callers should
  545. * pass 0 to ensure the scan starts from the beginning of the tag
  546. * buffer and does not accidentally look at message body.
  547. * @return position just after the space in "tagger ", so the first
  548. * character of the tagger's name. If no tagger header can be
  549. * located -1 is returned.
  550. */
  551. public static final int tagger(final byte[] b, int ptr) {
  552. final int sz = b.length;
  553. if (ptr == 0)
  554. ptr += 48; // skip the "object ..." line.
  555. while (ptr < sz) {
  556. if (b[ptr] == '\n')
  557. return -1;
  558. final int m = match(b, ptr, tagger);
  559. if (m >= 0)
  560. return m;
  561. ptr = nextLF(b, ptr);
  562. }
  563. return -1;
  564. }
  565. /**
  566. * Locate the "encoding " header line.
  567. *
  568. * @param b
  569. * buffer to scan.
  570. * @param ptr
  571. * position in buffer to start the scan at. Most callers should
  572. * pass 0 to ensure the scan starts from the beginning of the
  573. * buffer and does not accidentally look at the message body.
  574. * @return position just after the space in "encoding ", so the first
  575. * character of the encoding's name. If no encoding header can be
  576. * located -1 is returned (and UTF-8 should be assumed).
  577. */
  578. public static final int encoding(final byte[] b, int ptr) {
  579. final int sz = b.length;
  580. while (ptr < sz) {
  581. if (b[ptr] == '\n')
  582. return -1;
  583. if (b[ptr] == 'e')
  584. break;
  585. ptr = nextLF(b, ptr);
  586. }
  587. return match(b, ptr, encoding);
  588. }
  589. /**
  590. * Parse the "encoding " header into a character set reference.
  591. * <p>
  592. * Locates the "encoding " header (if present) by first calling
  593. * {@link #encoding(byte[], int)} and then returns the proper character set
  594. * to apply to this buffer to evaluate its contents as character data.
  595. * <p>
  596. * If no encoding header is present, {@link Constants#CHARSET} is assumed.
  597. *
  598. * @param b
  599. * buffer to scan.
  600. * @return the Java character set representation. Never null.
  601. */
  602. public static Charset parseEncoding(final byte[] b) {
  603. final int enc = encoding(b, 0);
  604. if (enc < 0)
  605. return Constants.CHARSET;
  606. final int lf = nextLF(b, enc);
  607. return Charset.forName(decode(Constants.CHARSET, b, enc, lf - 1));
  608. }
  609. /**
  610. * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
  611. * <p>
  612. * When passing in a value for <code>nameB</code> callers should use the
  613. * return value of {@link #author(byte[], int)} or
  614. * {@link #committer(byte[], int)}, as these methods provide the proper
  615. * position within the buffer.
  616. *
  617. * @param raw
  618. * the buffer to parse character data from.
  619. * @param nameB
  620. * first position of the identity information. This should be the
  621. * first position after the space which delimits the header field
  622. * name (e.g. "author" or "committer") from the rest of the
  623. * identity line.
  624. * @return the parsed identity. Never null.
  625. */
  626. public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
  627. final Charset cs = parseEncoding(raw);
  628. final int emailB = nextLF(raw, nameB, '<');
  629. final int emailE = nextLF(raw, emailB, '>');
  630. final String name = decode(cs, raw, nameB, emailB - 2);
  631. final String email = decode(cs, raw, emailB, emailE - 1);
  632. final MutableInteger ptrout = new MutableInteger();
  633. final long when = parseLongBase10(raw, emailE + 1, ptrout);
  634. final int tz = parseTimeZoneOffset(raw, ptrout.value);
  635. return new PersonIdent(name, email, when * 1000L, tz);
  636. }
  637. /**
  638. * Parse a name data (e.g. as within a reflog) into a PersonIdent.
  639. * <p>
  640. * When passing in a value for <code>nameB</code> callers should use the
  641. * return value of {@link #author(byte[], int)} or
  642. * {@link #committer(byte[], int)}, as these methods provide the proper
  643. * position within the buffer.
  644. *
  645. * @param raw
  646. * the buffer to parse character data from.
  647. * @param nameB
  648. * first position of the identity information. This should be the
  649. * first position after the space which delimits the header field
  650. * name (e.g. "author" or "committer") from the rest of the
  651. * identity line.
  652. * @return the parsed identity. Never null.
  653. */
  654. public static PersonIdent parsePersonIdentOnly(final byte[] raw, final int nameB) {
  655. int stop = nextLF(raw, nameB);
  656. int emailB = nextLF(raw, nameB, '<');
  657. int emailE = nextLF(raw, emailB, '>');
  658. final String name;
  659. final String email;
  660. if (emailE < stop) {
  661. email = decode(raw, emailB, emailE - 1);
  662. } else {
  663. email = "invalid";
  664. }
  665. if (emailB < stop)
  666. name = decode(raw, nameB, emailB - 2);
  667. else
  668. name = decode(raw, nameB, stop);
  669. final MutableInteger ptrout = new MutableInteger();
  670. long when;
  671. int tz;
  672. if (emailE < stop) {
  673. when = parseLongBase10(raw, emailE + 1, ptrout);
  674. tz = parseTimeZoneOffset(raw, ptrout.value);
  675. } else {
  676. when = 0;
  677. tz = 0;
  678. }
  679. return new PersonIdent(name, email, when * 1000L, tz);
  680. }
  681. /**
  682. * Locate the end of a footer line key string.
  683. * <p>
  684. * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
  685. * "Signed-off-by: A. U. Thor\n") then this method returns the position of
  686. * the first ':'.
  687. * <p>
  688. * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
  689. * then this method returns -1.
  690. *
  691. * @param raw
  692. * buffer to scan.
  693. * @param ptr
  694. * first position within raw to consider as a footer line key.
  695. * @return position of the ':' which terminates the footer line key if this
  696. * is otherwise a valid footer line key; otherwise -1.
  697. */
  698. public static int endOfFooterLineKey(final byte[] raw, int ptr) {
  699. try {
  700. for (;;) {
  701. final byte c = raw[ptr];
  702. if (footerLineKeyChars[c] == 0) {
  703. if (c == ':')
  704. return ptr;
  705. return -1;
  706. }
  707. ptr++;
  708. }
  709. } catch (ArrayIndexOutOfBoundsException e) {
  710. return -1;
  711. }
  712. }
  713. /**
  714. * Decode a buffer under UTF-8, if possible.
  715. *
  716. * If the byte stream cannot be decoded that way, the platform default is tried
  717. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  718. *
  719. * @param buffer
  720. * buffer to pull raw bytes from.
  721. * @return a string representation of the range <code>[start,end)</code>,
  722. * after decoding the region through the specified character set.
  723. */
  724. public static String decode(final byte[] buffer) {
  725. return decode(buffer, 0, buffer.length);
  726. }
  727. /**
  728. * Decode a buffer under UTF-8, if possible.
  729. *
  730. * If the byte stream cannot be decoded that way, the platform default is
  731. * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  732. *
  733. * @param buffer
  734. * buffer to pull raw bytes from.
  735. * @param start
  736. * start position in buffer
  737. * @param end
  738. * one position past the last location within the buffer to take
  739. * data from.
  740. * @return a string representation of the range <code>[start,end)</code>,
  741. * after decoding the region through the specified character set.
  742. */
  743. public static String decode(final byte[] buffer, final int start,
  744. final int end) {
  745. return decode(Constants.CHARSET, buffer, start, end);
  746. }
  747. /**
  748. * Decode a buffer under the specified character set if possible.
  749. *
  750. * If the byte stream cannot be decoded that way, the platform default is tried
  751. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  752. *
  753. * @param cs
  754. * character set to use when decoding the buffer.
  755. * @param buffer
  756. * buffer to pull raw bytes from.
  757. * @return a string representation of the range <code>[start,end)</code>,
  758. * after decoding the region through the specified character set.
  759. */
  760. public static String decode(final Charset cs, final byte[] buffer) {
  761. return decode(cs, buffer, 0, buffer.length);
  762. }
  763. /**
  764. * Decode a region of the buffer under the specified character set if possible.
  765. *
  766. * If the byte stream cannot be decoded that way, the platform default is tried
  767. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  768. *
  769. * @param cs
  770. * character set to use when decoding the buffer.
  771. * @param buffer
  772. * buffer to pull raw bytes from.
  773. * @param start
  774. * first position within the buffer to take data from.
  775. * @param end
  776. * one position past the last location within the buffer to take
  777. * data from.
  778. * @return a string representation of the range <code>[start,end)</code>,
  779. * after decoding the region through the specified character set.
  780. */
  781. public static String decode(final Charset cs, final byte[] buffer,
  782. final int start, final int end) {
  783. try {
  784. return decodeNoFallback(cs, buffer, start, end);
  785. } catch (CharacterCodingException e) {
  786. // Fall back to an ISO-8859-1 style encoding. At least all of
  787. // the bytes will be present in the output.
  788. //
  789. return extractBinaryString(buffer, start, end);
  790. }
  791. }
  792. /**
  793. * Decode a region of the buffer under the specified character set if
  794. * possible.
  795. *
  796. * If the byte stream cannot be decoded that way, the platform default is
  797. * tried and if that too fails, an exception is thrown.
  798. *
  799. * @param cs
  800. * character set to use when decoding the buffer.
  801. * @param buffer
  802. * buffer to pull raw bytes from.
  803. * @param start
  804. * first position within the buffer to take data from.
  805. * @param end
  806. * one position past the last location within the buffer to take
  807. * data from.
  808. * @return a string representation of the range <code>[start,end)</code>,
  809. * after decoding the region through the specified character set.
  810. * @throws CharacterCodingException
  811. * the input is not in any of the tested character sets.
  812. */
  813. public static String decodeNoFallback(final Charset cs,
  814. final byte[] buffer, final int start, final int end)
  815. throws CharacterCodingException {
  816. final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
  817. b.mark();
  818. // Try our built-in favorite. The assumption here is that
  819. // decoding will fail if the data is not actually encoded
  820. // using that encoder.
  821. //
  822. try {
  823. return decode(b, Constants.CHARSET);
  824. } catch (CharacterCodingException e) {
  825. b.reset();
  826. }
  827. if (!cs.equals(Constants.CHARSET)) {
  828. // Try the suggested encoding, it might be right since it was
  829. // provided by the caller.
  830. //
  831. try {
  832. return decode(b, cs);
  833. } catch (CharacterCodingException e) {
  834. b.reset();
  835. }
  836. }
  837. // Try the default character set. A small group of people
  838. // might actually use the same (or very similar) locale.
  839. //
  840. final Charset defcs = Charset.defaultCharset();
  841. if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
  842. try {
  843. return decode(b, defcs);
  844. } catch (CharacterCodingException e) {
  845. b.reset();
  846. }
  847. }
  848. throw new CharacterCodingException();
  849. }
  850. /**
  851. * Decode a region of the buffer under the ISO-8859-1 encoding.
  852. *
  853. * Each byte is treated as a single character in the 8859-1 character
  854. * encoding, performing a raw binary->char conversion.
  855. *
  856. * @param buffer
  857. * buffer to pull raw bytes from.
  858. * @param start
  859. * first position within the buffer to take data from.
  860. * @param end
  861. * one position past the last location within the buffer to take
  862. * data from.
  863. * @return a string representation of the range <code>[start,end)</code>.
  864. */
  865. public static String extractBinaryString(final byte[] buffer,
  866. final int start, final int end) {
  867. final StringBuilder r = new StringBuilder(end - start);
  868. for (int i = start; i < end; i++)
  869. r.append((char) (buffer[i] & 0xff));
  870. return r.toString();
  871. }
  872. private static String decode(final ByteBuffer b, final Charset charset)
  873. throws CharacterCodingException {
  874. final CharsetDecoder d = charset.newDecoder();
  875. d.onMalformedInput(CodingErrorAction.REPORT);
  876. d.onUnmappableCharacter(CodingErrorAction.REPORT);
  877. return d.decode(b).toString();
  878. }
  879. /**
  880. * Locate the position of the commit message body.
  881. *
  882. * @param b
  883. * buffer to scan.
  884. * @param ptr
  885. * position in buffer to start the scan at. Most callers should
  886. * pass 0 to ensure the scan starts from the beginning of the
  887. * commit buffer.
  888. * @return position of the user's message buffer.
  889. */
  890. public static final int commitMessage(final byte[] b, int ptr) {
  891. final int sz = b.length;
  892. if (ptr == 0)
  893. ptr += 46; // skip the "tree ..." line.
  894. while (ptr < sz && b[ptr] == 'p')
  895. ptr += 48; // skip this parent.
  896. // Skip any remaining header lines, ignoring what their actual
  897. // header line type is. This is identical to the logic for a tag.
  898. //
  899. return tagMessage(b, ptr);
  900. }
  901. /**
  902. * Locate the position of the tag message body.
  903. *
  904. * @param b
  905. * buffer to scan.
  906. * @param ptr
  907. * position in buffer to start the scan at. Most callers should
  908. * pass 0 to ensure the scan starts from the beginning of the tag
  909. * buffer.
  910. * @return position of the user's message buffer.
  911. */
  912. public static final int tagMessage(final byte[] b, int ptr) {
  913. final int sz = b.length;
  914. if (ptr == 0)
  915. ptr += 48; // skip the "object ..." line.
  916. while (ptr < sz && b[ptr] != '\n')
  917. ptr = nextLF(b, ptr);
  918. if (ptr < sz && b[ptr] == '\n')
  919. return ptr + 1;
  920. return -1;
  921. }
  922. /**
  923. * Locate the end of a paragraph.
  924. * <p>
  925. * A paragraph is ended by two consecutive LF bytes.
  926. *
  927. * @param b
  928. * buffer to scan.
  929. * @param start
  930. * position in buffer to start the scan at. Most callers will
  931. * want to pass the first position of the commit message (as
  932. * found by {@link #commitMessage(byte[], int)}.
  933. * @return position of the LF at the end of the paragraph;
  934. * <code>b.length</code> if no paragraph end could be located.
  935. */
  936. public static final int endOfParagraph(final byte[] b, final int start) {
  937. int ptr = start;
  938. final int sz = b.length;
  939. while (ptr < sz && b[ptr] != '\n')
  940. ptr = nextLF(b, ptr);
  941. while (0 < ptr && start < ptr && b[ptr - 1] == '\n')
  942. ptr--;
  943. return ptr;
  944. }
  /** Private so that this static-only utility class is never instantiated. */
  private RawParseUtils() {
    // Intentionally empty: all members of this class are static.
  }
  948. }