You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RawParseUtils.java 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251
  1. /*
  2. * Copyright (C) 2008-2009, Google Inc.
  3. * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
  4. * and other copyright owners as documented in the project's IP log.
  5. *
  6. * This program and the accompanying materials are made available
  7. * under the terms of the Eclipse Distribution License v1.0 which
  8. * accompanies this distribution, is reproduced below, and is
  9. * available at http://www.eclipse.org/org/documents/edl-v10.php
  10. *
  11. * All rights reserved.
  12. *
  13. * Redistribution and use in source and binary forms, with or
  14. * without modification, are permitted provided that the following
  15. * conditions are met:
  16. *
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. *
  20. * - Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials provided
  23. * with the distribution.
  24. *
  25. * - Neither the name of the Eclipse Foundation, Inc. nor the
  26. * names of its contributors may be used to endorse or promote
  27. * products derived from this software without specific prior
  28. * written permission.
  29. *
  30. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  31. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  32. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  33. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  35. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  36. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  38. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  40. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  41. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  42. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43. */
  44. package org.eclipse.jgit.util;
  45. import static java.nio.charset.StandardCharsets.ISO_8859_1;
  46. import static java.nio.charset.StandardCharsets.UTF_8;
  47. import static org.eclipse.jgit.lib.ObjectChecker.author;
  48. import static org.eclipse.jgit.lib.ObjectChecker.committer;
  49. import static org.eclipse.jgit.lib.ObjectChecker.encoding;
  50. import static org.eclipse.jgit.lib.ObjectChecker.tagger;
  51. import java.nio.ByteBuffer;
  52. import java.nio.charset.CharacterCodingException;
  53. import java.nio.charset.Charset;
  54. import java.nio.charset.CharsetDecoder;
  55. import java.nio.charset.CodingErrorAction;
  56. import java.nio.charset.IllegalCharsetNameException;
  57. import java.nio.charset.UnsupportedCharsetException;
  58. import java.util.Arrays;
  59. import java.util.HashMap;
  60. import java.util.Map;
  61. import org.eclipse.jgit.annotations.Nullable;
  62. import org.eclipse.jgit.lib.Constants;
  63. import org.eclipse.jgit.lib.PersonIdent;
  64. /** Handy utility functions to parse raw object contents. */
  65. public final class RawParseUtils {
  /**
   * UTF-8 charset constant; an alias of
   * {@link java.nio.charset.StandardCharsets#UTF_8}.
   *
   * @since 2.2
   */
  public static final Charset UTF8_CHARSET = UTF_8;

  /** Decimal digit lookup indexed by ASCII code; -1 marks a non-digit. */
  private static final byte[] digits10;

  /** Hex digit lookup indexed by ASCII code (both cases); -1 marks a non-digit. */
  private static final byte[] digits16;

  /** Indexed by ASCII code; 1 for '-', digits and ASCII letters, otherwise 0. */
  private static final byte[] footerLineKeyChars;

  /** Extra encoding names mapped onto charsets. */
  private static final Map<String, Charset> encodingAliases;

  static {
      final Map<String, Charset> aliases = new HashMap<>();
      aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
      aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
      encodingAliases = aliases;

      // The decimal table only needs to reach up to '9'.
      final byte[] dec = new byte['9' + 1];
      Arrays.fill(dec, (byte) -1);
      for (int n = 0; n < 10; n++) {
          dec['0' + n] = (byte) n;
      }
      digits10 = dec;

      // The hex table reaches up to 'f'; upper case letters sit below that.
      final byte[] hex = new byte['f' + 1];
      Arrays.fill(hex, (byte) -1);
      for (int n = 0; n < 10; n++) {
          hex['0' + n] = (byte) n;
      }
      for (int n = 0; n < 6; n++) {
          hex['a' + n] = (byte) (10 + n);
          hex['A' + n] = (byte) (10 + n);
      }
      digits16 = hex;

      final byte[] keyChars = new byte['z' + 1];
      keyChars['-'] = 1;
      for (int c = '0'; c <= '9'; c++) {
          keyChars[c] = 1;
      }
      for (int c = 'A'; c <= 'Z'; c++) {
          keyChars[c] = 1;
      }
      for (int c = 'a'; c <= 'z'; c++) {
          keyChars[c] = 1;
      }
      footerLineKeyChars = keyChars;
  }
  101. /**
  102. * Determine if b[ptr] matches src.
  103. *
  104. * @param b
  105. * the buffer to scan.
  106. * @param ptr
  107. * first position within b, this should match src[0].
  108. * @param src
  109. * the buffer to test for equality with b.
  110. * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  111. */
  112. public static final int match(final byte[] b, int ptr, final byte[] src) {
  113. if (ptr + src.length > b.length)
  114. return -1;
  115. for (int i = 0; i < src.length; i++, ptr++)
  116. if (b[ptr] != src[i])
  117. return -1;
  118. return ptr;
  119. }
  120. private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  121. '6', '7', '8', '9' };
  122. /**
  123. * Format a base 10 numeric into a temporary buffer.
  124. * <p>
  125. * Formatting is performed backwards. The method starts at offset
  126. * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  127. * <code>digits</code> is the number of positions necessary to store the
  128. * base 10 value.
  129. * <p>
  130. * The argument and return values from this method make it easy to chain
  131. * writing, for example:
  132. * </p>
  133. *
  134. * <pre>
  135. * final byte[] tmp = new byte[64];
  136. * int ptr = tmp.length;
  137. * tmp[--ptr] = '\n';
  138. * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  139. * tmp[--ptr] = ' ';
  140. * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  141. * tmp[--ptr] = 0;
  142. * final String str = new String(tmp, ptr, tmp.length - ptr);
  143. * </pre>
  144. *
  145. * @param b
  146. * buffer to write into.
  147. * @param o
  148. * one offset past the location where writing will begin; writing
  149. * proceeds towards lower index values.
  150. * @param value
  151. * the value to store.
  152. * @return the new offset value <code>o</code>. This is the position of
  153. * the last byte written. Additional writing should start at one
  154. * position earlier.
  155. */
  156. public static int formatBase10(final byte[] b, int o, int value) {
  157. if (value == 0) {
  158. b[--o] = '0';
  159. return o;
  160. }
  161. final boolean isneg = value < 0;
  162. if (isneg)
  163. value = -value;
  164. while (value != 0) {
  165. b[--o] = base10byte[value % 10];
  166. value /= 10;
  167. }
  168. if (isneg)
  169. b[--o] = '-';
  170. return o;
  171. }
  172. /**
  173. * Parse a base 10 numeric from a sequence of ASCII digits into an int.
  174. * <p>
  175. * Digit sequences can begin with an optional run of spaces before the
  176. * sequence, and may start with a '+' or a '-' to indicate sign position.
  177. * Any other characters will cause the method to stop and return the current
  178. * result to the caller.
  179. *
  180. * @param b
  181. * buffer to scan.
  182. * @param ptr
  183. * position within buffer to start parsing digits at.
  184. * @param ptrResult
  185. * optional location to return the new ptr value through. If null
  186. * the ptr value will be discarded.
  187. * @return the value at this location; 0 if the location is not a valid
  188. * numeric.
  189. */
  190. public static final int parseBase10(final byte[] b, int ptr,
  191. final MutableInteger ptrResult) {
  192. int r = 0;
  193. int sign = 0;
  194. try {
  195. final int sz = b.length;
  196. while (ptr < sz && b[ptr] == ' ')
  197. ptr++;
  198. if (ptr >= sz)
  199. return 0;
  200. switch (b[ptr]) {
  201. case '-':
  202. sign = -1;
  203. ptr++;
  204. break;
  205. case '+':
  206. ptr++;
  207. break;
  208. }
  209. while (ptr < sz) {
  210. final byte v = digits10[b[ptr]];
  211. if (v < 0)
  212. break;
  213. r = (r * 10) + v;
  214. ptr++;
  215. }
  216. } catch (ArrayIndexOutOfBoundsException e) {
  217. // Not a valid digit.
  218. }
  219. if (ptrResult != null)
  220. ptrResult.value = ptr;
  221. return sign < 0 ? -r : r;
  222. }
  223. /**
  224. * Parse a base 10 numeric from a sequence of ASCII digits into a long.
  225. * <p>
  226. * Digit sequences can begin with an optional run of spaces before the
  227. * sequence, and may start with a '+' or a '-' to indicate sign position.
  228. * Any other characters will cause the method to stop and return the current
  229. * result to the caller.
  230. *
  231. * @param b
  232. * buffer to scan.
  233. * @param ptr
  234. * position within buffer to start parsing digits at.
  235. * @param ptrResult
  236. * optional location to return the new ptr value through. If null
  237. * the ptr value will be discarded.
  238. * @return the value at this location; 0 if the location is not a valid
  239. * numeric.
  240. */
  241. public static final long parseLongBase10(final byte[] b, int ptr,
  242. final MutableInteger ptrResult) {
  243. long r = 0;
  244. int sign = 0;
  245. try {
  246. final int sz = b.length;
  247. while (ptr < sz && b[ptr] == ' ')
  248. ptr++;
  249. if (ptr >= sz)
  250. return 0;
  251. switch (b[ptr]) {
  252. case '-':
  253. sign = -1;
  254. ptr++;
  255. break;
  256. case '+':
  257. ptr++;
  258. break;
  259. }
  260. while (ptr < sz) {
  261. final byte v = digits10[b[ptr]];
  262. if (v < 0)
  263. break;
  264. r = (r * 10) + v;
  265. ptr++;
  266. }
  267. } catch (ArrayIndexOutOfBoundsException e) {
  268. // Not a valid digit.
  269. }
  270. if (ptrResult != null)
  271. ptrResult.value = ptr;
  272. return sign < 0 ? -r : r;
  273. }
  274. /**
  275. * Parse 4 character base 16 (hex) formatted string to unsigned integer.
  276. * <p>
  277. * The number is read in network byte order, that is, most significant
  278. * nybble first.
  279. *
  280. * @param bs
  281. * buffer to parse digits from; positions {@code [p, p+4)} will
  282. * be parsed.
  283. * @param p
  284. * first position within the buffer to parse.
  285. * @return the integer value.
  286. * @throws ArrayIndexOutOfBoundsException
  287. * if the string is not hex formatted.
  288. */
  289. public static final int parseHexInt16(final byte[] bs, final int p) {
  290. int r = digits16[bs[p]] << 4;
  291. r |= digits16[bs[p + 1]];
  292. r <<= 4;
  293. r |= digits16[bs[p + 2]];
  294. r <<= 4;
  295. r |= digits16[bs[p + 3]];
  296. if (r < 0)
  297. throw new ArrayIndexOutOfBoundsException();
  298. return r;
  299. }
  300. /**
  301. * Parse 8 character base 16 (hex) formatted string to unsigned integer.
  302. * <p>
  303. * The number is read in network byte order, that is, most significant
  304. * nybble first.
  305. *
  306. * @param bs
  307. * buffer to parse digits from; positions {@code [p, p+8)} will
  308. * be parsed.
  309. * @param p
  310. * first position within the buffer to parse.
  311. * @return the integer value.
  312. * @throws ArrayIndexOutOfBoundsException
  313. * if the string is not hex formatted.
  314. */
  315. public static final int parseHexInt32(final byte[] bs, final int p) {
  316. int r = digits16[bs[p]] << 4;
  317. r |= digits16[bs[p + 1]];
  318. r <<= 4;
  319. r |= digits16[bs[p + 2]];
  320. r <<= 4;
  321. r |= digits16[bs[p + 3]];
  322. r <<= 4;
  323. r |= digits16[bs[p + 4]];
  324. r <<= 4;
  325. r |= digits16[bs[p + 5]];
  326. r <<= 4;
  327. r |= digits16[bs[p + 6]];
  328. final int last = digits16[bs[p + 7]];
  329. if (r < 0 || last < 0)
  330. throw new ArrayIndexOutOfBoundsException();
  331. return (r << 4) | last;
  332. }
  333. /**
  334. * Parse 16 character base 16 (hex) formatted string to unsigned long.
  335. * <p>
  336. * The number is read in network byte order, that is, most significant
  337. * nibble first.
  338. *
  339. * @param bs
  340. * buffer to parse digits from; positions {@code [p, p+16)} will
  341. * be parsed.
  342. * @param p
  343. * first position within the buffer to parse.
  344. * @return the integer value.
  345. * @throws ArrayIndexOutOfBoundsException
  346. * if the string is not hex formatted.
  347. * @since 4.3
  348. */
  349. public static final long parseHexInt64(final byte[] bs, final int p) {
  350. long r = digits16[bs[p]] << 4;
  351. r |= digits16[bs[p + 1]];
  352. r <<= 4;
  353. r |= digits16[bs[p + 2]];
  354. r <<= 4;
  355. r |= digits16[bs[p + 3]];
  356. r <<= 4;
  357. r |= digits16[bs[p + 4]];
  358. r <<= 4;
  359. r |= digits16[bs[p + 5]];
  360. r <<= 4;
  361. r |= digits16[bs[p + 6]];
  362. r <<= 4;
  363. r |= digits16[bs[p + 7]];
  364. r <<= 4;
  365. r |= digits16[bs[p + 8]];
  366. r <<= 4;
  367. r |= digits16[bs[p + 9]];
  368. r <<= 4;
  369. r |= digits16[bs[p + 10]];
  370. r <<= 4;
  371. r |= digits16[bs[p + 11]];
  372. r <<= 4;
  373. r |= digits16[bs[p + 12]];
  374. r <<= 4;
  375. r |= digits16[bs[p + 13]];
  376. r <<= 4;
  377. r |= digits16[bs[p + 14]];
  378. final int last = digits16[bs[p + 15]];
  379. if (r < 0 || last < 0)
  380. throw new ArrayIndexOutOfBoundsException();
  381. return (r << 4) | last;
  382. }
  383. /**
  384. * Parse a single hex digit to its numeric value (0-15).
  385. *
  386. * @param digit
  387. * hex character to parse.
  388. * @return numeric value, in the range 0-15.
  389. * @throws ArrayIndexOutOfBoundsException
  390. * if the input digit is not a valid hex digit.
  391. */
  392. public static final int parseHexInt4(final byte digit) {
  393. final byte r = digits16[digit];
  394. if (r < 0)
  395. throw new ArrayIndexOutOfBoundsException();
  396. return r;
  397. }
  398. /**
  399. * Parse a Git style timezone string.
  400. * <p>
  401. * The sequence "-0315" will be parsed as the numeric value -195, as the
  402. * lower two positions count minutes, not 100ths of an hour.
  403. *
  404. * @param b
  405. * buffer to scan.
  406. * @param ptr
  407. * position within buffer to start parsing digits at.
  408. * @return the timezone at this location, expressed in minutes.
  409. */
  410. public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
  411. return parseTimeZoneOffset(b, ptr, null);
  412. }
  413. /**
  414. * Parse a Git style timezone string.
  415. * <p>
  416. * The sequence "-0315" will be parsed as the numeric value -195, as the
  417. * lower two positions count minutes, not 100ths of an hour.
  418. *
  419. * @param b
  420. * buffer to scan.
  421. * @param ptr
  422. * position within buffer to start parsing digits at.
  423. * @param ptrResult
  424. * optional location to return the new ptr value through. If null
  425. * the ptr value will be discarded.
  426. * @return the timezone at this location, expressed in minutes.
  427. * @since 4.1
  428. */
  429. public static final int parseTimeZoneOffset(final byte[] b, int ptr,
  430. MutableInteger ptrResult) {
  431. final int v = parseBase10(b, ptr, ptrResult);
  432. final int tzMins = v % 100;
  433. final int tzHours = v / 100;
  434. return tzHours * 60 + tzMins;
  435. }
  436. /**
  437. * Locate the first position after a given character.
  438. *
  439. * @param b
  440. * buffer to scan.
  441. * @param ptr
  442. * position within buffer to start looking for chrA at.
  443. * @param chrA
  444. * character to find.
  445. * @return new position just after chrA.
  446. */
  447. public static final int next(final byte[] b, int ptr, final char chrA) {
  448. final int sz = b.length;
  449. while (ptr < sz) {
  450. if (b[ptr++] == chrA)
  451. return ptr;
  452. }
  453. return ptr;
  454. }
  455. /**
  456. * Locate the first position after the next LF.
  457. * <p>
  458. * This method stops on the first '\n' it finds.
  459. *
  460. * @param b
  461. * buffer to scan.
  462. * @param ptr
  463. * position within buffer to start looking for LF at.
  464. * @return new position just after the first LF found.
  465. */
  466. public static final int nextLF(final byte[] b, int ptr) {
  467. return next(b, ptr, '\n');
  468. }
  469. /**
  470. * Locate the first position after either the given character or LF.
  471. * <p>
  472. * This method stops on the first match it finds from either chrA or '\n'.
  473. *
  474. * @param b
  475. * buffer to scan.
  476. * @param ptr
  477. * position within buffer to start looking for chrA or LF at.
  478. * @param chrA
  479. * character to find.
  480. * @return new position just after the first chrA or LF to be found.
  481. */
  482. public static final int nextLF(final byte[] b, int ptr, final char chrA) {
  483. final int sz = b.length;
  484. while (ptr < sz) {
  485. final byte c = b[ptr++];
  486. if (c == chrA || c == '\n')
  487. return ptr;
  488. }
  489. return ptr;
  490. }
  491. /**
  492. * Locate the first position before a given character.
  493. *
  494. * @param b
  495. * buffer to scan.
  496. * @param ptr
  497. * position within buffer to start looking for chrA at.
  498. * @param chrA
  499. * character to find.
  500. * @return new position just before chrA, -1 for not found
  501. */
  502. public static final int prev(final byte[] b, int ptr, final char chrA) {
  503. if (ptr == b.length)
  504. --ptr;
  505. while (ptr >= 0) {
  506. if (b[ptr--] == chrA)
  507. return ptr;
  508. }
  509. return ptr;
  510. }
  511. /**
  512. * Locate the first position before the previous LF.
  513. * <p>
  514. * This method stops on the first '\n' it finds.
  515. *
  516. * @param b
  517. * buffer to scan.
  518. * @param ptr
  519. * position within buffer to start looking for LF at.
  520. * @return new position just before the first LF found, -1 for not found
  521. */
  522. public static final int prevLF(final byte[] b, int ptr) {
  523. return prev(b, ptr, '\n');
  524. }
  525. /**
  526. * Locate the previous position before either the given character or LF.
  527. * <p>
  528. * This method stops on the first match it finds from either chrA or '\n'.
  529. *
  530. * @param b
  531. * buffer to scan.
  532. * @param ptr
  533. * position within buffer to start looking for chrA or LF at.
  534. * @param chrA
  535. * character to find.
  536. * @return new position just before the first chrA or LF to be found, -1 for
  537. * not found
  538. */
  539. public static final int prevLF(final byte[] b, int ptr, final char chrA) {
  540. if (ptr == b.length)
  541. --ptr;
  542. while (ptr >= 0) {
  543. final byte c = b[ptr--];
  544. if (c == chrA || c == '\n')
  545. return ptr;
  546. }
  547. return ptr;
  548. }
  549. /**
  550. * Index the region between <code>[ptr, end)</code> to find line starts.
  551. * <p>
  552. * The returned list is 1 indexed. Index 0 contains
  553. * {@link Integer#MIN_VALUE} to pad the list out.
  554. * <p>
  555. * Using a 1 indexed list means that line numbers can be directly accessed
  556. * from the list, so <code>list.get(1)</code> (aka get line 1) returns
  557. * <code>ptr</code>.
  558. * <p>
  559. * The last element (index <code>map.size()-1</code>) always contains
  560. * <code>end</code>.
  561. * <p>
  562. * If the data contains a '\0' anywhere, the whole region is considered binary
  563. * and a LineMap corresponding to a single line is returned.
  564. * </p>
  565. *
  566. * @param buf
  567. * buffer to scan.
  568. * @param ptr
  569. * position within the buffer corresponding to the first byte of
  570. * line 1.
  571. * @param end
  572. * 1 past the end of the content within <code>buf</code>.
  573. * @return a line map indexing the start position of each line.
  574. */
  575. public static final IntList lineMap(final byte[] buf, int ptr, int end) {
  576. int start = ptr;
  577. // Experimentally derived from multiple source repositories
  578. // the average number of bytes/line is 36. Its a rough guess
  579. // to initially size our map close to the target.
  580. IntList map = new IntList((end - ptr) / 36);
  581. map.add(Integer.MIN_VALUE);
  582. boolean foundLF = true;
  583. for (; ptr < end; ptr++) {
  584. if (foundLF) {
  585. map.add(ptr);
  586. }
  587. if (buf[ptr] == '\0') {
  588. // binary data.
  589. map = new IntList(3);
  590. map.add(Integer.MIN_VALUE);
  591. map.add(start);
  592. break;
  593. }
  594. foundLF = (buf[ptr] == '\n');
  595. }
  596. map.add(end);
  597. return map;
  598. }
  599. /**
  600. * Locate the "author " header line data.
  601. *
  602. * @param b
  603. * buffer to scan.
  604. * @param ptr
  605. * position in buffer to start the scan at. Most callers should
  606. * pass 0 to ensure the scan starts from the beginning of the
  607. * commit buffer and does not accidentally look at message body.
  608. * @return position just after the space in "author ", so the first
  609. * character of the author's name. If no author header can be
  610. * located -1 is returned.
  611. */
  612. public static final int author(final byte[] b, int ptr) {
  613. final int sz = b.length;
  614. if (ptr == 0)
  615. ptr += 46; // skip the "tree ..." line.
  616. while (ptr < sz && b[ptr] == 'p')
  617. ptr += 48; // skip this parent.
  618. return match(b, ptr, author);
  619. }
  620. /**
  621. * Locate the "committer " header line data.
  622. *
  623. * @param b
  624. * buffer to scan.
  625. * @param ptr
  626. * position in buffer to start the scan at. Most callers should
  627. * pass 0 to ensure the scan starts from the beginning of the
  628. * commit buffer and does not accidentally look at message body.
  629. * @return position just after the space in "committer ", so the first
  630. * character of the committer's name. If no committer header can be
  631. * located -1 is returned.
  632. */
  633. public static final int committer(final byte[] b, int ptr) {
  634. final int sz = b.length;
  635. if (ptr == 0)
  636. ptr += 46; // skip the "tree ..." line.
  637. while (ptr < sz && b[ptr] == 'p')
  638. ptr += 48; // skip this parent.
  639. if (ptr < sz && b[ptr] == 'a')
  640. ptr = nextLF(b, ptr);
  641. return match(b, ptr, committer);
  642. }
  643. /**
  644. * Locate the "tagger " header line data.
  645. *
  646. * @param b
  647. * buffer to scan.
  648. * @param ptr
  649. * position in buffer to start the scan at. Most callers should
  650. * pass 0 to ensure the scan starts from the beginning of the tag
  651. * buffer and does not accidentally look at message body.
  652. * @return position just after the space in "tagger ", so the first
  653. * character of the tagger's name. If no tagger header can be
  654. * located -1 is returned.
  655. */
  656. public static final int tagger(final byte[] b, int ptr) {
  657. final int sz = b.length;
  658. if (ptr == 0)
  659. ptr += 48; // skip the "object ..." line.
  660. while (ptr < sz) {
  661. if (b[ptr] == '\n')
  662. return -1;
  663. final int m = match(b, ptr, tagger);
  664. if (m >= 0)
  665. return m;
  666. ptr = nextLF(b, ptr);
  667. }
  668. return -1;
  669. }
  670. /**
  671. * Locate the "encoding " header line.
  672. *
  673. * @param b
  674. * buffer to scan.
  675. * @param ptr
  676. * position in buffer to start the scan at. Most callers should
  677. * pass 0 to ensure the scan starts from the beginning of the
  678. * buffer and does not accidentally look at the message body.
  679. * @return position just after the space in "encoding ", so the first
  680. * character of the encoding's name. If no encoding header can be
  681. * located -1 is returned (and UTF-8 should be assumed).
  682. */
  683. public static final int encoding(final byte[] b, int ptr) {
  684. final int sz = b.length;
  685. while (ptr < sz) {
  686. if (b[ptr] == '\n')
  687. return -1;
  688. if (b[ptr] == 'e')
  689. break;
  690. ptr = nextLF(b, ptr);
  691. }
  692. return match(b, ptr, encoding);
  693. }
  694. /**
  695. * Parse the "encoding " header as a string.
  696. * <p>
  697. * Locates the "encoding " header (if present) and returns its value.
  698. *
  699. * @param b
  700. * buffer to scan.
  701. * @return the encoding header as specified in the commit; null if the
  702. * header was not present and should be assumed.
  703. * @since 4.2
  704. */
  705. @Nullable
  706. public static String parseEncodingName(final byte[] b) {
  707. int enc = encoding(b, 0);
  708. if (enc < 0) {
  709. return null;
  710. }
  711. int lf = nextLF(b, enc);
  712. return decode(UTF_8, b, enc, lf - 1);
  713. }
  714. /**
  715. * Parse the "encoding " header into a character set reference.
  716. * <p>
  717. * Locates the "encoding " header (if present) by first calling
  718. * {@link #encoding(byte[], int)} and then returns the proper character set
  719. * to apply to this buffer to evaluate its contents as character data.
  720. * <p>
  721. * If no encoding header is present {@code UTF-8} is assumed.
  722. *
  723. * @param b
  724. * buffer to scan.
  725. * @return the Java character set representation. Never null.
  726. * @throws IllegalCharsetNameException
  727. * if the character set requested by the encoding header is
  728. * malformed and unsupportable.
  729. * @throws UnsupportedCharsetException
  730. * if the JRE does not support the character set requested by
  731. * the encoding header.
  732. */
  733. public static Charset parseEncoding(final byte[] b) {
  734. String enc = parseEncodingName(b);
  735. if (enc == null) {
  736. return UTF_8;
  737. }
  738. String name = enc.trim();
  739. try {
  740. return Charset.forName(name);
  741. } catch (IllegalCharsetNameException
  742. | UnsupportedCharsetException badName) {
  743. Charset aliased = charsetForAlias(name);
  744. if (aliased != null) {
  745. return aliased;
  746. }
  747. throw badName;
  748. }
  749. }
  750. /**
  751. * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
  752. * <p>
  753. * Leading spaces won't be trimmed from the string, i.e. will show up in the
  754. * parsed name afterwards.
  755. *
  756. * @param in
  757. * the string to parse a name from.
  758. * @return the parsed identity or null in case the identity could not be
  759. * parsed.
  760. */
  761. public static PersonIdent parsePersonIdent(final String in) {
  762. return parsePersonIdent(Constants.encode(in), 0);
  763. }
  764. /**
  765. * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
  766. * <p>
  767. * When passing in a value for <code>nameB</code> callers should use the
  768. * return value of {@link #author(byte[], int)} or
  769. * {@link #committer(byte[], int)}, as these methods provide the proper
  770. * position within the buffer.
  771. *
  772. * @param raw
  773. * the buffer to parse character data from.
  774. * @param nameB
  775. * first position of the identity information. This should be the
  776. * first position after the space which delimits the header field
  777. * name (e.g. "author" or "committer") from the rest of the
  778. * identity line.
  779. * @return the parsed identity or null in case the identity could not be
  780. * parsed.
  781. */
  782. public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
  783. Charset cs;
  784. try {
  785. cs = parseEncoding(raw);
  786. } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
  787. // Assume UTF-8 for person identities, usually this is correct.
  788. // If not decode() will fall back to the ISO-8859-1 encoding.
  789. cs = UTF_8;
  790. }
  791. final int emailB = nextLF(raw, nameB, '<');
  792. final int emailE = nextLF(raw, emailB, '>');
  793. if (emailB >= raw.length || raw[emailB] == '\n' ||
  794. (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
  795. return null;
  796. final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
  797. emailB - 2 : emailB - 1;
  798. final String name = decode(cs, raw, nameB, nameEnd);
  799. final String email = decode(cs, raw, emailB, emailE - 1);
  800. // Start searching from end of line, as after first name-email pair,
  801. // another name-email pair may occur. We will ignore all kinds of
  802. // "junk" following the first email.
  803. //
  804. // We've to use (emailE - 1) for the case that raw[email] is LF,
  805. // otherwise we would run too far. "-2" is necessary to position
  806. // before the LF in case of LF termination resp. the penultimate
  807. // character if there is no trailing LF.
  808. final int tzBegin = lastIndexOfTrim(raw, ' ',
  809. nextLF(raw, emailE - 1) - 2) + 1;
  810. if (tzBegin <= emailE) // No time/zone, still valid
  811. return new PersonIdent(name, email, 0, 0);
  812. final int whenBegin = Math.max(emailE,
  813. lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
  814. if (whenBegin >= tzBegin - 1) // No time/zone, still valid
  815. return new PersonIdent(name, email, 0, 0);
  816. final long when = parseLongBase10(raw, whenBegin, null);
  817. final int tz = parseTimeZoneOffset(raw, tzBegin);
  818. return new PersonIdent(name, email, when * 1000L, tz);
  819. }
  820. /**
  821. * Parse a name data (e.g. as within a reflog) into a PersonIdent.
  822. * <p>
  823. * When passing in a value for <code>nameB</code> callers should use the
  824. * return value of {@link #author(byte[], int)} or
  825. * {@link #committer(byte[], int)}, as these methods provide the proper
  826. * position within the buffer.
  827. *
  828. * @param raw
  829. * the buffer to parse character data from.
  830. * @param nameB
  831. * first position of the identity information. This should be the
  832. * first position after the space which delimits the header field
  833. * name (e.g. "author" or "committer") from the rest of the
  834. * identity line.
  835. * @return the parsed identity. Never null.
  836. */
  837. public static PersonIdent parsePersonIdentOnly(final byte[] raw,
  838. final int nameB) {
  839. int stop = nextLF(raw, nameB);
  840. int emailB = nextLF(raw, nameB, '<');
  841. int emailE = nextLF(raw, emailB, '>');
  842. final String name;
  843. final String email;
  844. if (emailE < stop) {
  845. email = decode(raw, emailB, emailE - 1);
  846. } else {
  847. email = "invalid"; //$NON-NLS-1$
  848. }
  849. if (emailB < stop)
  850. name = decode(raw, nameB, emailB - 2);
  851. else
  852. name = decode(raw, nameB, stop);
  853. final MutableInteger ptrout = new MutableInteger();
  854. long when;
  855. int tz;
  856. if (emailE < stop) {
  857. when = parseLongBase10(raw, emailE + 1, ptrout);
  858. tz = parseTimeZoneOffset(raw, ptrout.value);
  859. } else {
  860. when = 0;
  861. tz = 0;
  862. }
  863. return new PersonIdent(name, email, when * 1000L, tz);
  864. }
  865. /**
  866. * Locate the end of a footer line key string.
  867. * <p>
  868. * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
  869. * "Signed-off-by: A. U. Thor\n") then this method returns the position of
  870. * the first ':'.
  871. * <p>
  872. * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
  873. * then this method returns -1.
  874. *
  875. * @param raw
  876. * buffer to scan.
  877. * @param ptr
  878. * first position within raw to consider as a footer line key.
  879. * @return position of the ':' which terminates the footer line key if this
  880. * is otherwise a valid footer line key; otherwise -1.
  881. */
  882. public static int endOfFooterLineKey(final byte[] raw, int ptr) {
  883. try {
  884. for (;;) {
  885. final byte c = raw[ptr];
  886. if (footerLineKeyChars[c] == 0) {
  887. if (c == ':')
  888. return ptr;
  889. return -1;
  890. }
  891. ptr++;
  892. }
  893. } catch (ArrayIndexOutOfBoundsException e) {
  894. return -1;
  895. }
  896. }
  897. /**
  898. * Decode a buffer under UTF-8, if possible.
  899. *
  900. * If the byte stream cannot be decoded that way, the platform default is tried
  901. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  902. *
  903. * @param buffer
  904. * buffer to pull raw bytes from.
  905. * @return a string representation of the range <code>[start,end)</code>,
  906. * after decoding the region through the specified character set.
  907. */
  908. public static String decode(final byte[] buffer) {
  909. return decode(buffer, 0, buffer.length);
  910. }
  911. /**
  912. * Decode a buffer under UTF-8, if possible.
  913. *
  914. * If the byte stream cannot be decoded that way, the platform default is
  915. * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  916. *
  917. * @param buffer
  918. * buffer to pull raw bytes from.
  919. * @param start
  920. * start position in buffer
  921. * @param end
  922. * one position past the last location within the buffer to take
  923. * data from.
  924. * @return a string representation of the range <code>[start,end)</code>,
  925. * after decoding the region through the specified character set.
  926. */
  927. public static String decode(final byte[] buffer, final int start,
  928. final int end) {
  929. return decode(UTF_8, buffer, start, end);
  930. }
  931. /**
  932. * Decode a buffer under the specified character set if possible.
  933. *
  934. * If the byte stream cannot be decoded that way, the platform default is tried
  935. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  936. *
  937. * @param cs
  938. * character set to use when decoding the buffer.
  939. * @param buffer
  940. * buffer to pull raw bytes from.
  941. * @return a string representation of the range <code>[start,end)</code>,
  942. * after decoding the region through the specified character set.
  943. */
  944. public static String decode(final Charset cs, final byte[] buffer) {
  945. return decode(cs, buffer, 0, buffer.length);
  946. }
  947. /**
  948. * Decode a region of the buffer under the specified character set if possible.
  949. *
  950. * If the byte stream cannot be decoded that way, the platform default is tried
  951. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  952. *
  953. * @param cs
  954. * character set to use when decoding the buffer.
  955. * @param buffer
  956. * buffer to pull raw bytes from.
  957. * @param start
  958. * first position within the buffer to take data from.
  959. * @param end
  960. * one position past the last location within the buffer to take
  961. * data from.
  962. * @return a string representation of the range <code>[start,end)</code>,
  963. * after decoding the region through the specified character set.
  964. */
  965. public static String decode(final Charset cs, final byte[] buffer,
  966. final int start, final int end) {
  967. try {
  968. return decodeNoFallback(cs, buffer, start, end);
  969. } catch (CharacterCodingException e) {
  970. // Fall back to an ISO-8859-1 style encoding. At least all of
  971. // the bytes will be present in the output.
  972. //
  973. return extractBinaryString(buffer, start, end);
  974. }
  975. }
  976. /**
  977. * Decode a region of the buffer under the specified character set if
  978. * possible.
  979. *
  980. * If the byte stream cannot be decoded that way, the platform default is
  981. * tried and if that too fails, an exception is thrown.
  982. *
  983. * @param cs
  984. * character set to use when decoding the buffer.
  985. * @param buffer
  986. * buffer to pull raw bytes from.
  987. * @param start
  988. * first position within the buffer to take data from.
  989. * @param end
  990. * one position past the last location within the buffer to take
  991. * data from.
  992. * @return a string representation of the range <code>[start,end)</code>,
  993. * after decoding the region through the specified character set.
  994. * @throws CharacterCodingException
  995. * the input is not in any of the tested character sets.
  996. */
  997. public static String decodeNoFallback(final Charset cs,
  998. final byte[] buffer, final int start, final int end)
  999. throws CharacterCodingException {
  1000. ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
  1001. b.mark();
  1002. // Try our built-in favorite. The assumption here is that
  1003. // decoding will fail if the data is not actually encoded
  1004. // using that encoder.
  1005. try {
  1006. return decode(b, UTF_8);
  1007. } catch (CharacterCodingException e) {
  1008. b.reset();
  1009. }
  1010. if (!cs.equals(UTF_8)) {
  1011. // Try the suggested encoding, it might be right since it was
  1012. // provided by the caller.
  1013. try {
  1014. return decode(b, cs);
  1015. } catch (CharacterCodingException e) {
  1016. b.reset();
  1017. }
  1018. }
  1019. // Try the default character set. A small group of people
  1020. // might actually use the same (or very similar) locale.
  1021. Charset defcs = Charset.defaultCharset();
  1022. if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
  1023. try {
  1024. return decode(b, defcs);
  1025. } catch (CharacterCodingException e) {
  1026. b.reset();
  1027. }
  1028. }
  1029. throw new CharacterCodingException();
  1030. }
  1031. /**
  1032. * Decode a region of the buffer under the ISO-8859-1 encoding.
  1033. *
  1034. * Each byte is treated as a single character in the 8859-1 character
  1035. * encoding, performing a raw binary-&gt;char conversion.
  1036. *
  1037. * @param buffer
  1038. * buffer to pull raw bytes from.
  1039. * @param start
  1040. * first position within the buffer to take data from.
  1041. * @param end
  1042. * one position past the last location within the buffer to take
  1043. * data from.
  1044. * @return a string representation of the range <code>[start,end)</code>.
  1045. */
  1046. public static String extractBinaryString(final byte[] buffer,
  1047. final int start, final int end) {
  1048. final StringBuilder r = new StringBuilder(end - start);
  1049. for (int i = start; i < end; i++)
  1050. r.append((char) (buffer[i] & 0xff));
  1051. return r.toString();
  1052. }
  1053. private static String decode(final ByteBuffer b, final Charset charset)
  1054. throws CharacterCodingException {
  1055. final CharsetDecoder d = charset.newDecoder();
  1056. d.onMalformedInput(CodingErrorAction.REPORT);
  1057. d.onUnmappableCharacter(CodingErrorAction.REPORT);
  1058. return d.decode(b).toString();
  1059. }
  1060. /**
  1061. * Locate the position of the commit message body.
  1062. *
  1063. * @param b
  1064. * buffer to scan.
  1065. * @param ptr
  1066. * position in buffer to start the scan at. Most callers should
  1067. * pass 0 to ensure the scan starts from the beginning of the
  1068. * commit buffer.
  1069. * @return position of the user's message buffer.
  1070. */
  1071. public static final int commitMessage(final byte[] b, int ptr) {
  1072. final int sz = b.length;
  1073. if (ptr == 0)
  1074. ptr += 46; // skip the "tree ..." line.
  1075. while (ptr < sz && b[ptr] == 'p')
  1076. ptr += 48; // skip this parent.
  1077. // Skip any remaining header lines, ignoring what their actual
  1078. // header line type is. This is identical to the logic for a tag.
  1079. //
  1080. return tagMessage(b, ptr);
  1081. }
  1082. /**
  1083. * Locate the position of the tag message body.
  1084. *
  1085. * @param b
  1086. * buffer to scan.
  1087. * @param ptr
  1088. * position in buffer to start the scan at. Most callers should
  1089. * pass 0 to ensure the scan starts from the beginning of the tag
  1090. * buffer.
  1091. * @return position of the user's message buffer.
  1092. */
  1093. public static final int tagMessage(final byte[] b, int ptr) {
  1094. final int sz = b.length;
  1095. if (ptr == 0)
  1096. ptr += 48; // skip the "object ..." line.
  1097. while (ptr < sz && b[ptr] != '\n')
  1098. ptr = nextLF(b, ptr);
  1099. if (ptr < sz && b[ptr] == '\n')
  1100. return ptr + 1;
  1101. return -1;
  1102. }
  1103. /**
  1104. * Locate the end of a paragraph.
  1105. * <p>
  1106. * A paragraph is ended by two consecutive LF bytes or CRLF pairs
  1107. *
  1108. * @param b
  1109. * buffer to scan.
  1110. * @param start
  1111. * position in buffer to start the scan at. Most callers will
  1112. * want to pass the first position of the commit message (as
  1113. * found by {@link #commitMessage(byte[], int)}.
  1114. * @return position of the LF at the end of the paragraph;
  1115. * <code>b.length</code> if no paragraph end could be located.
  1116. */
  1117. public static final int endOfParagraph(final byte[] b, final int start) {
  1118. int ptr = start;
  1119. final int sz = b.length;
  1120. while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
  1121. ptr = nextLF(b, ptr);
  1122. if (ptr > start && b[ptr - 1] == '\n')
  1123. ptr--;
  1124. if (ptr > start && b[ptr - 1] == '\r')
  1125. ptr--;
  1126. return ptr;
  1127. }
  1128. /**
  1129. * @param raw
  1130. * buffer to scan.
  1131. * @param ch
  1132. * character to find.
  1133. * @param pos
  1134. * starting position.
  1135. * @return last index of ch in raw, trimming spaces.
  1136. * @since 4.1
  1137. */
  1138. public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
  1139. while (pos >= 0 && raw[pos] == ' ')
  1140. pos--;
  1141. while (pos >= 0 && raw[pos] != ch)
  1142. pos--;
  1143. return pos;
  1144. }
  1145. private static Charset charsetForAlias(String name) {
  1146. return encodingAliases.get(StringUtils.toLowerCase(name));
  1147. }
  /** Private constructor: this class only offers static helpers. */
  private RawParseUtils() {
    // Don't create instances of a static only utility.
  }
  1151. }