You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

RawParseUtils.java 37KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255
  1. /*
  2. * Copyright (C) 2008-2009, Google Inc.
  3. * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
  4. * and other copyright owners as documented in the project's IP log.
  5. *
  6. * This program and the accompanying materials are made available
  7. * under the terms of the Eclipse Distribution License v1.0 which
  8. * accompanies this distribution, is reproduced below, and is
  9. * available at http://www.eclipse.org/org/documents/edl-v10.php
  10. *
  11. * All rights reserved.
  12. *
  13. * Redistribution and use in source and binary forms, with or
  14. * without modification, are permitted provided that the following
  15. * conditions are met:
  16. *
  17. * - Redistributions of source code must retain the above copyright
  18. * notice, this list of conditions and the following disclaimer.
  19. *
  20. * - Redistributions in binary form must reproduce the above
  21. * copyright notice, this list of conditions and the following
  22. * disclaimer in the documentation and/or other materials provided
  23. * with the distribution.
  24. *
  25. * - Neither the name of the Eclipse Foundation, Inc. nor the
  26. * names of its contributors may be used to endorse or promote
  27. * products derived from this software without specific prior
  28. * written permission.
  29. *
  30. * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
  31. * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
  32. * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  33. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  34. * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
  35. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  36. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  37. * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  38. * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  39. * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  40. * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  41. * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
  42. * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  43. */
  44. package org.eclipse.jgit.util;
  45. import static java.nio.charset.StandardCharsets.ISO_8859_1;
  46. import static java.nio.charset.StandardCharsets.UTF_8;
  47. import static org.eclipse.jgit.lib.ObjectChecker.author;
  48. import static org.eclipse.jgit.lib.ObjectChecker.committer;
  49. import static org.eclipse.jgit.lib.ObjectChecker.encoding;
  50. import static org.eclipse.jgit.lib.ObjectChecker.tagger;
  51. import java.nio.ByteBuffer;
  52. import java.nio.charset.CharacterCodingException;
  53. import java.nio.charset.Charset;
  54. import java.nio.charset.CharsetDecoder;
  55. import java.nio.charset.CodingErrorAction;
  56. import java.nio.charset.IllegalCharsetNameException;
  57. import java.nio.charset.UnsupportedCharsetException;
  58. import java.util.Arrays;
  59. import java.util.HashMap;
  60. import java.util.Map;
  61. import org.eclipse.jgit.annotations.Nullable;
  62. import org.eclipse.jgit.lib.Constants;
  63. import org.eclipse.jgit.lib.PersonIdent;
  64. /**
  65. * Handy utility functions to parse raw object contents.
  66. */
  67. public final class RawParseUtils {
  /**
   * UTF-8 charset constant.
   *
   * @since 2.2
   */
  public static final Charset UTF8_CHARSET = UTF_8;

  /**
   * Decimal digit values indexed by ASCII code ('0'..'9' map to 0..9);
   * every other slot holds -1. The table ends at '9'.
   */
  private static final byte[] digits10;

  /**
   * Hexadecimal digit values indexed by ASCII code ('0'..'9', 'a'..'f'
   * and 'A'..'F' map to 0..15); every other slot holds -1. The table ends
   * at 'f'.
   */
  private static final byte[] digits16;

  /**
   * Flag table indexed by ASCII code: 1 for '-', '0'..'9', 'A'..'Z' and
   * 'a'..'z', 0 for everything else. The table ends at 'z'.
   */
  private static final byte[] footerLineKeyChars;

  /** Additional encoding names, each mapped to ISO-8859-1. */
  private static final Map<String, Charset> encodingAliases;

  static {
    final Map<String, Charset> aliases = new HashMap<>();
    aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
    aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
    encodingAliases = aliases;

    final byte[] decimal = new byte['9' + 1];
    Arrays.fill(decimal, (byte) -1);
    for (int c = '0'; c <= '9'; c++) {
      decimal[c] = (byte) (c - '0');
    }
    digits10 = decimal;

    final byte[] hex = new byte['f' + 1];
    Arrays.fill(hex, (byte) -1);
    for (int c = '0'; c <= '9'; c++) {
      hex[c] = (byte) (c - '0');
    }
    for (int n = 0; n < 6; n++) {
      // Lower and upper case letters carry the same value.
      hex['a' + n] = (byte) (10 + n);
      hex['A' + n] = (byte) (10 + n);
    }
    digits16 = hex;

    final byte[] keyChars = new byte['z' + 1];
    keyChars['-'] = 1;
    for (int c = '0'; c <= '9'; c++) {
      keyChars[c] = 1;
    }
    for (int n = 0; n < 26; n++) {
      keyChars['A' + n] = 1;
      keyChars['a' + n] = 1;
    }
    footerLineKeyChars = keyChars;
  }
  103. /**
  104. * Determine if b[ptr] matches src.
  105. *
  106. * @param b
  107. * the buffer to scan.
  108. * @param ptr
  109. * first position within b, this should match src[0].
  110. * @param src
  111. * the buffer to test for equality with b.
  112. * @return ptr + src.length if b[ptr..src.length] == src; else -1.
  113. */
  114. public static final int match(final byte[] b, int ptr, final byte[] src) {
  115. if (ptr + src.length > b.length)
  116. return -1;
  117. for (int i = 0; i < src.length; i++, ptr++)
  118. if (b[ptr] != src[i])
  119. return -1;
  120. return ptr;
  121. }
  122. private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
  123. '6', '7', '8', '9' };
  124. /**
  125. * Format a base 10 numeric into a temporary buffer.
  126. * <p>
  127. * Formatting is performed backwards. The method starts at offset
  128. * <code>o-1</code> and ends at <code>o-1-digits</code>, where
  129. * <code>digits</code> is the number of positions necessary to store the
  130. * base 10 value.
  131. * <p>
  132. * The argument and return values from this method make it easy to chain
  133. * writing, for example:
  134. * </p>
  135. *
  136. * <pre>
  137. * final byte[] tmp = new byte[64];
  138. * int ptr = tmp.length;
  139. * tmp[--ptr] = '\n';
  140. * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
  141. * tmp[--ptr] = ' ';
  142. * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
  143. * tmp[--ptr] = 0;
  144. * final String str = new String(tmp, ptr, tmp.length - ptr);
  145. * </pre>
  146. *
  147. * @param b
  148. * buffer to write into.
  149. * @param o
  150. * one offset past the location where writing will begin; writing
  151. * proceeds towards lower index values.
  152. * @param value
  153. * the value to store.
  154. * @return the new offset value <code>o</code>. This is the position of
  155. * the last byte written. Additional writing should start at one
  156. * position earlier.
  157. */
  158. public static int formatBase10(final byte[] b, int o, int value) {
  159. if (value == 0) {
  160. b[--o] = '0';
  161. return o;
  162. }
  163. final boolean isneg = value < 0;
  164. if (isneg)
  165. value = -value;
  166. while (value != 0) {
  167. b[--o] = base10byte[value % 10];
  168. value /= 10;
  169. }
  170. if (isneg)
  171. b[--o] = '-';
  172. return o;
  173. }
  174. /**
  175. * Parse a base 10 numeric from a sequence of ASCII digits into an int.
  176. * <p>
  177. * Digit sequences can begin with an optional run of spaces before the
  178. * sequence, and may start with a '+' or a '-' to indicate sign position.
  179. * Any other characters will cause the method to stop and return the current
  180. * result to the caller.
  181. *
  182. * @param b
  183. * buffer to scan.
  184. * @param ptr
  185. * position within buffer to start parsing digits at.
  186. * @param ptrResult
  187. * optional location to return the new ptr value through. If null
  188. * the ptr value will be discarded.
  189. * @return the value at this location; 0 if the location is not a valid
  190. * numeric.
  191. */
  192. public static final int parseBase10(final byte[] b, int ptr,
  193. final MutableInteger ptrResult) {
  194. int r = 0;
  195. int sign = 0;
  196. try {
  197. final int sz = b.length;
  198. while (ptr < sz && b[ptr] == ' ')
  199. ptr++;
  200. if (ptr >= sz)
  201. return 0;
  202. switch (b[ptr]) {
  203. case '-':
  204. sign = -1;
  205. ptr++;
  206. break;
  207. case '+':
  208. ptr++;
  209. break;
  210. }
  211. while (ptr < sz) {
  212. final byte v = digits10[b[ptr]];
  213. if (v < 0)
  214. break;
  215. r = (r * 10) + v;
  216. ptr++;
  217. }
  218. } catch (ArrayIndexOutOfBoundsException e) {
  219. // Not a valid digit.
  220. }
  221. if (ptrResult != null)
  222. ptrResult.value = ptr;
  223. return sign < 0 ? -r : r;
  224. }
  225. /**
  226. * Parse a base 10 numeric from a sequence of ASCII digits into a long.
  227. * <p>
  228. * Digit sequences can begin with an optional run of spaces before the
  229. * sequence, and may start with a '+' or a '-' to indicate sign position.
  230. * Any other characters will cause the method to stop and return the current
  231. * result to the caller.
  232. *
  233. * @param b
  234. * buffer to scan.
  235. * @param ptr
  236. * position within buffer to start parsing digits at.
  237. * @param ptrResult
  238. * optional location to return the new ptr value through. If null
  239. * the ptr value will be discarded.
  240. * @return the value at this location; 0 if the location is not a valid
  241. * numeric.
  242. */
  243. public static final long parseLongBase10(final byte[] b, int ptr,
  244. final MutableInteger ptrResult) {
  245. long r = 0;
  246. int sign = 0;
  247. try {
  248. final int sz = b.length;
  249. while (ptr < sz && b[ptr] == ' ')
  250. ptr++;
  251. if (ptr >= sz)
  252. return 0;
  253. switch (b[ptr]) {
  254. case '-':
  255. sign = -1;
  256. ptr++;
  257. break;
  258. case '+':
  259. ptr++;
  260. break;
  261. }
  262. while (ptr < sz) {
  263. final byte v = digits10[b[ptr]];
  264. if (v < 0)
  265. break;
  266. r = (r * 10) + v;
  267. ptr++;
  268. }
  269. } catch (ArrayIndexOutOfBoundsException e) {
  270. // Not a valid digit.
  271. }
  272. if (ptrResult != null)
  273. ptrResult.value = ptr;
  274. return sign < 0 ? -r : r;
  275. }
  276. /**
  277. * Parse 4 character base 16 (hex) formatted string to unsigned integer.
  278. * <p>
  279. * The number is read in network byte order, that is, most significant
  280. * nybble first.
  281. *
  282. * @param bs
  283. * buffer to parse digits from; positions {@code [p, p+4)} will
  284. * be parsed.
  285. * @param p
  286. * first position within the buffer to parse.
  287. * @return the integer value.
  288. * @throws java.lang.ArrayIndexOutOfBoundsException
  289. * if the string is not hex formatted.
  290. */
  291. public static final int parseHexInt16(final byte[] bs, final int p) {
  292. int r = digits16[bs[p]] << 4;
  293. r |= digits16[bs[p + 1]];
  294. r <<= 4;
  295. r |= digits16[bs[p + 2]];
  296. r <<= 4;
  297. r |= digits16[bs[p + 3]];
  298. if (r < 0)
  299. throw new ArrayIndexOutOfBoundsException();
  300. return r;
  301. }
  302. /**
  303. * Parse 8 character base 16 (hex) formatted string to unsigned integer.
  304. * <p>
  305. * The number is read in network byte order, that is, most significant
  306. * nybble first.
  307. *
  308. * @param bs
  309. * buffer to parse digits from; positions {@code [p, p+8)} will
  310. * be parsed.
  311. * @param p
  312. * first position within the buffer to parse.
  313. * @return the integer value.
  314. * @throws java.lang.ArrayIndexOutOfBoundsException
  315. * if the string is not hex formatted.
  316. */
  317. public static final int parseHexInt32(final byte[] bs, final int p) {
  318. int r = digits16[bs[p]] << 4;
  319. r |= digits16[bs[p + 1]];
  320. r <<= 4;
  321. r |= digits16[bs[p + 2]];
  322. r <<= 4;
  323. r |= digits16[bs[p + 3]];
  324. r <<= 4;
  325. r |= digits16[bs[p + 4]];
  326. r <<= 4;
  327. r |= digits16[bs[p + 5]];
  328. r <<= 4;
  329. r |= digits16[bs[p + 6]];
  330. final int last = digits16[bs[p + 7]];
  331. if (r < 0 || last < 0)
  332. throw new ArrayIndexOutOfBoundsException();
  333. return (r << 4) | last;
  334. }
  335. /**
  336. * Parse 16 character base 16 (hex) formatted string to unsigned long.
  337. * <p>
  338. * The number is read in network byte order, that is, most significant
  339. * nibble first.
  340. *
  341. * @param bs
  342. * buffer to parse digits from; positions {@code [p, p+16)} will
  343. * be parsed.
  344. * @param p
  345. * first position within the buffer to parse.
  346. * @return the integer value.
  347. * @throws java.lang.ArrayIndexOutOfBoundsException
  348. * if the string is not hex formatted.
  349. * @since 4.3
  350. */
  351. public static final long parseHexInt64(final byte[] bs, final int p) {
  352. long r = digits16[bs[p]] << 4;
  353. r |= digits16[bs[p + 1]];
  354. r <<= 4;
  355. r |= digits16[bs[p + 2]];
  356. r <<= 4;
  357. r |= digits16[bs[p + 3]];
  358. r <<= 4;
  359. r |= digits16[bs[p + 4]];
  360. r <<= 4;
  361. r |= digits16[bs[p + 5]];
  362. r <<= 4;
  363. r |= digits16[bs[p + 6]];
  364. r <<= 4;
  365. r |= digits16[bs[p + 7]];
  366. r <<= 4;
  367. r |= digits16[bs[p + 8]];
  368. r <<= 4;
  369. r |= digits16[bs[p + 9]];
  370. r <<= 4;
  371. r |= digits16[bs[p + 10]];
  372. r <<= 4;
  373. r |= digits16[bs[p + 11]];
  374. r <<= 4;
  375. r |= digits16[bs[p + 12]];
  376. r <<= 4;
  377. r |= digits16[bs[p + 13]];
  378. r <<= 4;
  379. r |= digits16[bs[p + 14]];
  380. final int last = digits16[bs[p + 15]];
  381. if (r < 0 || last < 0)
  382. throw new ArrayIndexOutOfBoundsException();
  383. return (r << 4) | last;
  384. }
  385. /**
  386. * Parse a single hex digit to its numeric value (0-15).
  387. *
  388. * @param digit
  389. * hex character to parse.
  390. * @return numeric value, in the range 0-15.
  391. * @throws java.lang.ArrayIndexOutOfBoundsException
  392. * if the input digit is not a valid hex digit.
  393. */
  394. public static final int parseHexInt4(final byte digit) {
  395. final byte r = digits16[digit];
  396. if (r < 0)
  397. throw new ArrayIndexOutOfBoundsException();
  398. return r;
  399. }
  400. /**
  401. * Parse a Git style timezone string.
  402. * <p>
  403. * The sequence "-0315" will be parsed as the numeric value -195, as the
  404. * lower two positions count minutes, not 100ths of an hour.
  405. *
  406. * @param b
  407. * buffer to scan.
  408. * @param ptr
  409. * position within buffer to start parsing digits at.
  410. * @return the timezone at this location, expressed in minutes.
  411. */
  412. public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
  413. return parseTimeZoneOffset(b, ptr, null);
  414. }
  415. /**
  416. * Parse a Git style timezone string.
  417. * <p>
  418. * The sequence "-0315" will be parsed as the numeric value -195, as the
  419. * lower two positions count minutes, not 100ths of an hour.
  420. *
  421. * @param b
  422. * buffer to scan.
  423. * @param ptr
  424. * position within buffer to start parsing digits at.
  425. * @param ptrResult
  426. * optional location to return the new ptr value through. If null
  427. * the ptr value will be discarded.
  428. * @return the timezone at this location, expressed in minutes.
  429. * @since 4.1
  430. */
  431. public static final int parseTimeZoneOffset(final byte[] b, int ptr,
  432. MutableInteger ptrResult) {
  433. final int v = parseBase10(b, ptr, ptrResult);
  434. final int tzMins = v % 100;
  435. final int tzHours = v / 100;
  436. return tzHours * 60 + tzMins;
  437. }
  438. /**
  439. * Locate the first position after a given character.
  440. *
  441. * @param b
  442. * buffer to scan.
  443. * @param ptr
  444. * position within buffer to start looking for chrA at.
  445. * @param chrA
  446. * character to find.
  447. * @return new position just after chrA.
  448. */
  449. public static final int next(final byte[] b, int ptr, final char chrA) {
  450. final int sz = b.length;
  451. while (ptr < sz) {
  452. if (b[ptr++] == chrA)
  453. return ptr;
  454. }
  455. return ptr;
  456. }
  457. /**
  458. * Locate the first position after the next LF.
  459. * <p>
  460. * This method stops on the first '\n' it finds.
  461. *
  462. * @param b
  463. * buffer to scan.
  464. * @param ptr
  465. * position within buffer to start looking for LF at.
  466. * @return new position just after the first LF found.
  467. */
  468. public static final int nextLF(final byte[] b, int ptr) {
  469. return next(b, ptr, '\n');
  470. }
  471. /**
  472. * Locate the first position after either the given character or LF.
  473. * <p>
  474. * This method stops on the first match it finds from either chrA or '\n'.
  475. *
  476. * @param b
  477. * buffer to scan.
  478. * @param ptr
  479. * position within buffer to start looking for chrA or LF at.
  480. * @param chrA
  481. * character to find.
  482. * @return new position just after the first chrA or LF to be found.
  483. */
  484. public static final int nextLF(final byte[] b, int ptr, final char chrA) {
  485. final int sz = b.length;
  486. while (ptr < sz) {
  487. final byte c = b[ptr++];
  488. if (c == chrA || c == '\n')
  489. return ptr;
  490. }
  491. return ptr;
  492. }
  493. /**
  494. * Locate the first position before a given character.
  495. *
  496. * @param b
  497. * buffer to scan.
  498. * @param ptr
  499. * position within buffer to start looking for chrA at.
  500. * @param chrA
  501. * character to find.
  502. * @return new position just before chrA, -1 for not found
  503. */
  504. public static final int prev(final byte[] b, int ptr, final char chrA) {
  505. if (ptr == b.length)
  506. --ptr;
  507. while (ptr >= 0) {
  508. if (b[ptr--] == chrA)
  509. return ptr;
  510. }
  511. return ptr;
  512. }
  513. /**
  514. * Locate the first position before the previous LF.
  515. * <p>
  516. * This method stops on the first '\n' it finds.
  517. *
  518. * @param b
  519. * buffer to scan.
  520. * @param ptr
  521. * position within buffer to start looking for LF at.
  522. * @return new position just before the first LF found, -1 for not found
  523. */
  524. public static final int prevLF(final byte[] b, int ptr) {
  525. return prev(b, ptr, '\n');
  526. }
  527. /**
  528. * Locate the previous position before either the given character or LF.
  529. * <p>
  530. * This method stops on the first match it finds from either chrA or '\n'.
  531. *
  532. * @param b
  533. * buffer to scan.
  534. * @param ptr
  535. * position within buffer to start looking for chrA or LF at.
  536. * @param chrA
  537. * character to find.
  538. * @return new position just before the first chrA or LF to be found, -1 for
  539. * not found
  540. */
  541. public static final int prevLF(final byte[] b, int ptr, final char chrA) {
  542. if (ptr == b.length)
  543. --ptr;
  544. while (ptr >= 0) {
  545. final byte c = b[ptr--];
  546. if (c == chrA || c == '\n')
  547. return ptr;
  548. }
  549. return ptr;
  550. }
  551. /**
  552. * Index the region between <code>[ptr, end)</code> to find line starts.
  553. * <p>
  554. * The returned list is 1 indexed. Index 0 contains
  555. * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
  556. * <p>
  557. * Using a 1 indexed list means that line numbers can be directly accessed
  558. * from the list, so <code>list.get(1)</code> (aka get line 1) returns
  559. * <code>ptr</code>.
  560. * <p>
  561. * The last element (index <code>map.size()-1</code>) always contains
  562. * <code>end</code>.
  563. * <p>
  564. * If the data contains a '\0' anywhere, the whole region is considered
  565. * binary and a LineMap corresponding to a single line is returned.
  566. * </p>
  567. *
  568. * @param buf
  569. * buffer to scan.
  570. * @param ptr
  571. * position within the buffer corresponding to the first byte of
  572. * line 1.
  573. * @param end
  574. * 1 past the end of the content within <code>buf</code>.
  575. * @return a line map indexing the start position of each line.
  576. */
  577. public static final IntList lineMap(final byte[] buf, int ptr, int end) {
  578. int start = ptr;
  579. // Experimentally derived from multiple source repositories
  580. // the average number of bytes/line is 36. Its a rough guess
  581. // to initially size our map close to the target.
  582. IntList map = new IntList((end - ptr) / 36);
  583. map.add(Integer.MIN_VALUE);
  584. boolean foundLF = true;
  585. for (; ptr < end; ptr++) {
  586. if (foundLF) {
  587. map.add(ptr);
  588. }
  589. if (buf[ptr] == '\0') {
  590. // binary data.
  591. map = new IntList(3);
  592. map.add(Integer.MIN_VALUE);
  593. map.add(start);
  594. break;
  595. }
  596. foundLF = (buf[ptr] == '\n');
  597. }
  598. map.add(end);
  599. return map;
  600. }
  601. /**
  602. * Locate the "author " header line data.
  603. *
  604. * @param b
  605. * buffer to scan.
  606. * @param ptr
  607. * position in buffer to start the scan at. Most callers should
  608. * pass 0 to ensure the scan starts from the beginning of the
  609. * commit buffer and does not accidentally look at message body.
  610. * @return position just after the space in "author ", so the first
  611. * character of the author's name. If no author header can be
  612. * located -1 is returned.
  613. */
  614. public static final int author(final byte[] b, int ptr) {
  615. final int sz = b.length;
  616. if (ptr == 0)
  617. ptr += 46; // skip the "tree ..." line.
  618. while (ptr < sz && b[ptr] == 'p')
  619. ptr += 48; // skip this parent.
  620. return match(b, ptr, author);
  621. }
  622. /**
  623. * Locate the "committer " header line data.
  624. *
  625. * @param b
  626. * buffer to scan.
  627. * @param ptr
  628. * position in buffer to start the scan at. Most callers should
  629. * pass 0 to ensure the scan starts from the beginning of the
  630. * commit buffer and does not accidentally look at message body.
  631. * @return position just after the space in "committer ", so the first
  632. * character of the committer's name. If no committer header can be
  633. * located -1 is returned.
  634. */
  635. public static final int committer(final byte[] b, int ptr) {
  636. final int sz = b.length;
  637. if (ptr == 0)
  638. ptr += 46; // skip the "tree ..." line.
  639. while (ptr < sz && b[ptr] == 'p')
  640. ptr += 48; // skip this parent.
  641. if (ptr < sz && b[ptr] == 'a')
  642. ptr = nextLF(b, ptr);
  643. return match(b, ptr, committer);
  644. }
  645. /**
  646. * Locate the "tagger " header line data.
  647. *
  648. * @param b
  649. * buffer to scan.
  650. * @param ptr
  651. * position in buffer to start the scan at. Most callers should
  652. * pass 0 to ensure the scan starts from the beginning of the tag
  653. * buffer and does not accidentally look at message body.
  654. * @return position just after the space in "tagger ", so the first
  655. * character of the tagger's name. If no tagger header can be
  656. * located -1 is returned.
  657. */
  658. public static final int tagger(final byte[] b, int ptr) {
  659. final int sz = b.length;
  660. if (ptr == 0)
  661. ptr += 48; // skip the "object ..." line.
  662. while (ptr < sz) {
  663. if (b[ptr] == '\n')
  664. return -1;
  665. final int m = match(b, ptr, tagger);
  666. if (m >= 0)
  667. return m;
  668. ptr = nextLF(b, ptr);
  669. }
  670. return -1;
  671. }
  672. /**
  673. * Locate the "encoding " header line.
  674. *
  675. * @param b
  676. * buffer to scan.
  677. * @param ptr
  678. * position in buffer to start the scan at. Most callers should
  679. * pass 0 to ensure the scan starts from the beginning of the
  680. * buffer and does not accidentally look at the message body.
  681. * @return position just after the space in "encoding ", so the first
  682. * character of the encoding's name. If no encoding header can be
  683. * located -1 is returned (and UTF-8 should be assumed).
  684. */
  685. public static final int encoding(final byte[] b, int ptr) {
  686. final int sz = b.length;
  687. while (ptr < sz) {
  688. if (b[ptr] == '\n')
  689. return -1;
  690. if (b[ptr] == 'e')
  691. break;
  692. ptr = nextLF(b, ptr);
  693. }
  694. return match(b, ptr, encoding);
  695. }
  696. /**
  697. * Parse the "encoding " header as a string.
  698. * <p>
  699. * Locates the "encoding " header (if present) and returns its value.
  700. *
  701. * @param b
  702. * buffer to scan.
  703. * @return the encoding header as specified in the commit; null if the
  704. * header was not present and should be assumed.
  705. * @since 4.2
  706. */
  707. @Nullable
  708. public static String parseEncodingName(final byte[] b) {
  709. int enc = encoding(b, 0);
  710. if (enc < 0) {
  711. return null;
  712. }
  713. int lf = nextLF(b, enc);
  714. return decode(UTF_8, b, enc, lf - 1);
  715. }
  716. /**
  717. * Parse the "encoding " header into a character set reference.
  718. * <p>
  719. * Locates the "encoding " header (if present) by first calling
  720. * {@link #encoding(byte[], int)} and then returns the proper character set
  721. * to apply to this buffer to evaluate its contents as character data.
  722. * <p>
  723. * If no encoding header is present {@code UTF-8} is assumed.
  724. *
  725. * @param b
  726. * buffer to scan.
  727. * @return the Java character set representation. Never null.
  728. * @throws IllegalCharsetNameException
  729. * if the character set requested by the encoding header is
  730. * malformed and unsupportable.
  731. * @throws UnsupportedCharsetException
  732. * if the JRE does not support the character set requested by
  733. * the encoding header.
  734. */
  735. public static Charset parseEncoding(final byte[] b) {
  736. String enc = parseEncodingName(b);
  737. if (enc == null) {
  738. return UTF_8;
  739. }
  740. String name = enc.trim();
  741. try {
  742. return Charset.forName(name);
  743. } catch (IllegalCharsetNameException
  744. | UnsupportedCharsetException badName) {
  745. Charset aliased = charsetForAlias(name);
  746. if (aliased != null) {
  747. return aliased;
  748. }
  749. throw badName;
  750. }
  751. }
  752. /**
  753. * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
  754. * <p>
  755. * Leading spaces won't be trimmed from the string, i.e. will show up in the
  756. * parsed name afterwards.
  757. *
  758. * @param in
  759. * the string to parse a name from.
  760. * @return the parsed identity or null in case the identity could not be
  761. * parsed.
  762. */
  763. public static PersonIdent parsePersonIdent(final String in) {
  764. return parsePersonIdent(Constants.encode(in), 0);
  765. }
  766. /**
  767. * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
  768. * <p>
  769. * When passing in a value for <code>nameB</code> callers should use the
  770. * return value of {@link #author(byte[], int)} or
  771. * {@link #committer(byte[], int)}, as these methods provide the proper
  772. * position within the buffer.
  773. *
  774. * @param raw
  775. * the buffer to parse character data from.
  776. * @param nameB
  777. * first position of the identity information. This should be the
  778. * first position after the space which delimits the header field
  779. * name (e.g. "author" or "committer") from the rest of the
  780. * identity line.
  781. * @return the parsed identity or null in case the identity could not be
  782. * parsed.
  783. */
  784. public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
  785. Charset cs;
  786. try {
  787. cs = parseEncoding(raw);
  788. } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
  789. // Assume UTF-8 for person identities, usually this is correct.
  790. // If not decode() will fall back to the ISO-8859-1 encoding.
  791. cs = UTF_8;
  792. }
  793. final int emailB = nextLF(raw, nameB, '<');
  794. final int emailE = nextLF(raw, emailB, '>');
  795. if (emailB >= raw.length || raw[emailB] == '\n' ||
  796. (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
  797. return null;
  798. final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
  799. emailB - 2 : emailB - 1;
  800. final String name = decode(cs, raw, nameB, nameEnd);
  801. final String email = decode(cs, raw, emailB, emailE - 1);
  802. // Start searching from end of line, as after first name-email pair,
  803. // another name-email pair may occur. We will ignore all kinds of
  804. // "junk" following the first email.
  805. //
  806. // We've to use (emailE - 1) for the case that raw[email] is LF,
  807. // otherwise we would run too far. "-2" is necessary to position
  808. // before the LF in case of LF termination resp. the penultimate
  809. // character if there is no trailing LF.
  810. final int tzBegin = lastIndexOfTrim(raw, ' ',
  811. nextLF(raw, emailE - 1) - 2) + 1;
  812. if (tzBegin <= emailE) // No time/zone, still valid
  813. return new PersonIdent(name, email, 0, 0);
  814. final int whenBegin = Math.max(emailE,
  815. lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
  816. if (whenBegin >= tzBegin - 1) // No time/zone, still valid
  817. return new PersonIdent(name, email, 0, 0);
  818. final long when = parseLongBase10(raw, whenBegin, null);
  819. final int tz = parseTimeZoneOffset(raw, tzBegin);
  820. return new PersonIdent(name, email, when * 1000L, tz);
  821. }
  822. /**
  823. * Parse a name data (e.g. as within a reflog) into a PersonIdent.
  824. * <p>
  825. * When passing in a value for <code>nameB</code> callers should use the
  826. * return value of {@link #author(byte[], int)} or
  827. * {@link #committer(byte[], int)}, as these methods provide the proper
  828. * position within the buffer.
  829. *
  830. * @param raw
  831. * the buffer to parse character data from.
  832. * @param nameB
  833. * first position of the identity information. This should be the
  834. * first position after the space which delimits the header field
  835. * name (e.g. "author" or "committer") from the rest of the
  836. * identity line.
  837. * @return the parsed identity. Never null.
  838. */
  839. public static PersonIdent parsePersonIdentOnly(final byte[] raw,
  840. final int nameB) {
  841. int stop = nextLF(raw, nameB);
  842. int emailB = nextLF(raw, nameB, '<');
  843. int emailE = nextLF(raw, emailB, '>');
  844. final String name;
  845. final String email;
  846. if (emailE < stop) {
  847. email = decode(raw, emailB, emailE - 1);
  848. } else {
  849. email = "invalid"; //$NON-NLS-1$
  850. }
  851. if (emailB < stop)
  852. name = decode(raw, nameB, emailB - 2);
  853. else
  854. name = decode(raw, nameB, stop);
  855. final MutableInteger ptrout = new MutableInteger();
  856. long when;
  857. int tz;
  858. if (emailE < stop) {
  859. when = parseLongBase10(raw, emailE + 1, ptrout);
  860. tz = parseTimeZoneOffset(raw, ptrout.value);
  861. } else {
  862. when = 0;
  863. tz = 0;
  864. }
  865. return new PersonIdent(name, email, when * 1000L, tz);
  866. }
  867. /**
  868. * Locate the end of a footer line key string.
  869. * <p>
  870. * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
  871. * "Signed-off-by: A. U. Thor\n") then this method returns the position of
  872. * the first ':'.
  873. * <p>
  874. * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
  875. * then this method returns -1.
  876. *
  877. * @param raw
  878. * buffer to scan.
  879. * @param ptr
  880. * first position within raw to consider as a footer line key.
  881. * @return position of the ':' which terminates the footer line key if this
  882. * is otherwise a valid footer line key; otherwise -1.
  883. */
  884. public static int endOfFooterLineKey(final byte[] raw, int ptr) {
  885. try {
  886. for (;;) {
  887. final byte c = raw[ptr];
  888. if (footerLineKeyChars[c] == 0) {
  889. if (c == ':')
  890. return ptr;
  891. return -1;
  892. }
  893. ptr++;
  894. }
  895. } catch (ArrayIndexOutOfBoundsException e) {
  896. return -1;
  897. }
  898. }
  899. /**
  900. * Decode a buffer under UTF-8, if possible.
  901. *
  902. * If the byte stream cannot be decoded that way, the platform default is tried
  903. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  904. *
  905. * @param buffer
  906. * buffer to pull raw bytes from.
  907. * @return a string representation of the range <code>[start,end)</code>,
  908. * after decoding the region through the specified character set.
  909. */
  910. public static String decode(final byte[] buffer) {
  911. return decode(buffer, 0, buffer.length);
  912. }
  913. /**
  914. * Decode a buffer under UTF-8, if possible.
  915. *
  916. * If the byte stream cannot be decoded that way, the platform default is
  917. * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  918. *
  919. * @param buffer
  920. * buffer to pull raw bytes from.
  921. * @param start
  922. * start position in buffer
  923. * @param end
  924. * one position past the last location within the buffer to take
  925. * data from.
  926. * @return a string representation of the range <code>[start,end)</code>,
  927. * after decoding the region through the specified character set.
  928. */
  929. public static String decode(final byte[] buffer, final int start,
  930. final int end) {
  931. return decode(UTF_8, buffer, start, end);
  932. }
  933. /**
  934. * Decode a buffer under the specified character set if possible.
  935. *
  936. * If the byte stream cannot be decoded that way, the platform default is tried
  937. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  938. *
  939. * @param cs
  940. * character set to use when decoding the buffer.
  941. * @param buffer
  942. * buffer to pull raw bytes from.
  943. * @return a string representation of the range <code>[start,end)</code>,
  944. * after decoding the region through the specified character set.
  945. */
  946. public static String decode(final Charset cs, final byte[] buffer) {
  947. return decode(cs, buffer, 0, buffer.length);
  948. }
  949. /**
  950. * Decode a region of the buffer under the specified character set if possible.
  951. *
  952. * If the byte stream cannot be decoded that way, the platform default is tried
  953. * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
  954. *
  955. * @param cs
  956. * character set to use when decoding the buffer.
  957. * @param buffer
  958. * buffer to pull raw bytes from.
  959. * @param start
  960. * first position within the buffer to take data from.
  961. * @param end
  962. * one position past the last location within the buffer to take
  963. * data from.
  964. * @return a string representation of the range <code>[start,end)</code>,
  965. * after decoding the region through the specified character set.
  966. */
  967. public static String decode(final Charset cs, final byte[] buffer,
  968. final int start, final int end) {
  969. try {
  970. return decodeNoFallback(cs, buffer, start, end);
  971. } catch (CharacterCodingException e) {
  972. // Fall back to an ISO-8859-1 style encoding. At least all of
  973. // the bytes will be present in the output.
  974. //
  975. return extractBinaryString(buffer, start, end);
  976. }
  977. }
  978. /**
  979. * Decode a region of the buffer under the specified character set if
  980. * possible.
  981. *
  982. * If the byte stream cannot be decoded that way, the platform default is
  983. * tried and if that too fails, an exception is thrown.
  984. *
  985. * @param cs
  986. * character set to use when decoding the buffer.
  987. * @param buffer
  988. * buffer to pull raw bytes from.
  989. * @param start
  990. * first position within the buffer to take data from.
  991. * @param end
  992. * one position past the last location within the buffer to take
  993. * data from.
  994. * @return a string representation of the range <code>[start,end)</code>,
  995. * after decoding the region through the specified character set.
  996. * @throws java.nio.charset.CharacterCodingException
  997. * the input is not in any of the tested character sets.
  998. */
  999. public static String decodeNoFallback(final Charset cs,
  1000. final byte[] buffer, final int start, final int end)
  1001. throws CharacterCodingException {
  1002. ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
  1003. b.mark();
  1004. // Try our built-in favorite. The assumption here is that
  1005. // decoding will fail if the data is not actually encoded
  1006. // using that encoder.
  1007. try {
  1008. return decode(b, UTF_8);
  1009. } catch (CharacterCodingException e) {
  1010. b.reset();
  1011. }
  1012. if (!cs.equals(UTF_8)) {
  1013. // Try the suggested encoding, it might be right since it was
  1014. // provided by the caller.
  1015. try {
  1016. return decode(b, cs);
  1017. } catch (CharacterCodingException e) {
  1018. b.reset();
  1019. }
  1020. }
  1021. // Try the default character set. A small group of people
  1022. // might actually use the same (or very similar) locale.
  1023. Charset defcs = Charset.defaultCharset();
  1024. if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
  1025. try {
  1026. return decode(b, defcs);
  1027. } catch (CharacterCodingException e) {
  1028. b.reset();
  1029. }
  1030. }
  1031. throw new CharacterCodingException();
  1032. }
  1033. /**
  1034. * Decode a region of the buffer under the ISO-8859-1 encoding.
  1035. *
  1036. * Each byte is treated as a single character in the 8859-1 character
  1037. * encoding, performing a raw binary-&gt;char conversion.
  1038. *
  1039. * @param buffer
  1040. * buffer to pull raw bytes from.
  1041. * @param start
  1042. * first position within the buffer to take data from.
  1043. * @param end
  1044. * one position past the last location within the buffer to take
  1045. * data from.
  1046. * @return a string representation of the range <code>[start,end)</code>.
  1047. */
  1048. public static String extractBinaryString(final byte[] buffer,
  1049. final int start, final int end) {
  1050. final StringBuilder r = new StringBuilder(end - start);
  1051. for (int i = start; i < end; i++)
  1052. r.append((char) (buffer[i] & 0xff));
  1053. return r.toString();
  1054. }
  1055. private static String decode(final ByteBuffer b, final Charset charset)
  1056. throws CharacterCodingException {
  1057. final CharsetDecoder d = charset.newDecoder();
  1058. d.onMalformedInput(CodingErrorAction.REPORT);
  1059. d.onUnmappableCharacter(CodingErrorAction.REPORT);
  1060. return d.decode(b).toString();
  1061. }
  1062. /**
  1063. * Locate the position of the commit message body.
  1064. *
  1065. * @param b
  1066. * buffer to scan.
  1067. * @param ptr
  1068. * position in buffer to start the scan at. Most callers should
  1069. * pass 0 to ensure the scan starts from the beginning of the
  1070. * commit buffer.
  1071. * @return position of the user's message buffer.
  1072. */
  1073. public static final int commitMessage(final byte[] b, int ptr) {
  1074. final int sz = b.length;
  1075. if (ptr == 0)
  1076. ptr += 46; // skip the "tree ..." line.
  1077. while (ptr < sz && b[ptr] == 'p')
  1078. ptr += 48; // skip this parent.
  1079. // Skip any remaining header lines, ignoring what their actual
  1080. // header line type is. This is identical to the logic for a tag.
  1081. //
  1082. return tagMessage(b, ptr);
  1083. }
  1084. /**
  1085. * Locate the position of the tag message body.
  1086. *
  1087. * @param b
  1088. * buffer to scan.
  1089. * @param ptr
  1090. * position in buffer to start the scan at. Most callers should
  1091. * pass 0 to ensure the scan starts from the beginning of the tag
  1092. * buffer.
  1093. * @return position of the user's message buffer.
  1094. */
  1095. public static final int tagMessage(final byte[] b, int ptr) {
  1096. final int sz = b.length;
  1097. if (ptr == 0)
  1098. ptr += 48; // skip the "object ..." line.
  1099. while (ptr < sz && b[ptr] != '\n')
  1100. ptr = nextLF(b, ptr);
  1101. if (ptr < sz && b[ptr] == '\n')
  1102. return ptr + 1;
  1103. return -1;
  1104. }
  1105. /**
  1106. * Locate the end of a paragraph.
  1107. * <p>
  1108. * A paragraph is ended by two consecutive LF bytes or CRLF pairs
  1109. *
  1110. * @param b
  1111. * buffer to scan.
  1112. * @param start
  1113. * position in buffer to start the scan at. Most callers will
  1114. * want to pass the first position of the commit message (as
  1115. * found by {@link #commitMessage(byte[], int)}.
  1116. * @return position of the LF at the end of the paragraph;
  1117. * <code>b.length</code> if no paragraph end could be located.
  1118. */
  1119. public static final int endOfParagraph(final byte[] b, final int start) {
  1120. int ptr = start;
  1121. final int sz = b.length;
  1122. while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
  1123. ptr = nextLF(b, ptr);
  1124. if (ptr > start && b[ptr - 1] == '\n')
  1125. ptr--;
  1126. if (ptr > start && b[ptr - 1] == '\r')
  1127. ptr--;
  1128. return ptr;
  1129. }
  1130. /**
  1131. * Get last index of {@code ch} in raw, trimming spaces.
  1132. *
  1133. * @param raw
  1134. * buffer to scan.
  1135. * @param ch
  1136. * character to find.
  1137. * @param pos
  1138. * starting position.
  1139. * @return last index of {@code ch} in raw, trimming spaces.
  1140. * @since 4.1
  1141. */
  1142. public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
  1143. while (pos >= 0 && raw[pos] == ' ')
  1144. pos--;
  1145. while (pos >= 0 && raw[pos] != ch)
  1146. pos--;
  1147. return pos;
  1148. }
  1149. private static Charset charsetForAlias(String name) {
  1150. return encodingAliases.get(StringUtils.toLowerCase(name));
  1151. }
  /** Not instantiable: this is a utility class holding only static members. */
  private RawParseUtils() {
    // Intentionally empty.
  }
  1155. }