12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251 |
- /*
- * Copyright (C) 2008-2009, Google Inc.
- * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
- * and other copyright owners as documented in the project's IP log.
- *
- * This program and the accompanying materials are made available
- * under the terms of the Eclipse Distribution License v1.0 which
- * accompanies this distribution, is reproduced below, and is
- * available at http://www.eclipse.org/org/documents/edl-v10.php
- *
- * All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials provided
- * with the distribution.
- *
- * - Neither the name of the Eclipse Foundation, Inc. nor the
- * names of its contributors may be used to endorse or promote
- * products derived from this software without specific prior
- * written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
- * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
- * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
- * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
- * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
- * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
- * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
- * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- */
-
- package org.eclipse.jgit.util;
-
- import static java.nio.charset.StandardCharsets.ISO_8859_1;
- import static java.nio.charset.StandardCharsets.UTF_8;
- import static org.eclipse.jgit.lib.ObjectChecker.author;
- import static org.eclipse.jgit.lib.ObjectChecker.committer;
- import static org.eclipse.jgit.lib.ObjectChecker.encoding;
- import static org.eclipse.jgit.lib.ObjectChecker.tagger;
-
- import java.nio.ByteBuffer;
- import java.nio.charset.CharacterCodingException;
- import java.nio.charset.Charset;
- import java.nio.charset.CharsetDecoder;
- import java.nio.charset.CodingErrorAction;
- import java.nio.charset.IllegalCharsetNameException;
- import java.nio.charset.UnsupportedCharsetException;
- import java.util.Arrays;
- import java.util.HashMap;
- import java.util.Map;
-
- import org.eclipse.jgit.annotations.Nullable;
- import org.eclipse.jgit.lib.Constants;
- import org.eclipse.jgit.lib.PersonIdent;
-
- /** Handy utility functions to parse raw object contents. */
- public final class RawParseUtils {
/**
 * UTF-8 charset constant.
 *
 * @since 2.2
 */
public static final Charset UTF8_CHARSET = UTF_8;

/** Decimal digit value indexed by ASCII code; -1 for any other code. */
private static final byte[] digits10;

/** Hex digit value indexed by ASCII code; -1 for any other code. */
private static final byte[] digits16;

/** Holds 1 at the ASCII codes of '-', 0-9, A-Z and a-z; 0 elsewhere. */
private static final byte[] footerLineKeyChars;

/** Extra charset names mapped onto a standard charset. */
private static final Map<String, Charset> encodingAliases;

static {
    final Map<String, Charset> aliases = new HashMap<>();
    aliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
    aliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
    encodingAliases = aliases;

    // Table covers codes 0..'9'; larger codes fall outside the array.
    final byte[] dec = new byte['9' + 1];
    Arrays.fill(dec, (byte) -1);
    for (int c = '0'; c <= '9'; c++) {
        dec[c] = (byte) (c - '0');
    }
    digits10 = dec;

    // Table covers codes 0..'f'; both letter cases map to 10..15.
    final byte[] hex = new byte['f' + 1];
    Arrays.fill(hex, (byte) -1);
    for (int c = '0'; c <= '9'; c++) {
        hex[c] = (byte) (c - '0');
    }
    for (int c = 'a'; c <= 'f'; c++) {
        hex[c] = (byte) ((c - 'a') + 10);
    }
    for (int c = 'A'; c <= 'F'; c++) {
        hex[c] = (byte) ((c - 'A') + 10);
    }
    digits16 = hex;

    // Table covers codes 0..'z'; entries default to 0 (not a key char).
    final byte[] key = new byte['z' + 1];
    key['-'] = 1;
    for (int c = '0'; c <= '9'; c++) {
        key[c] = 1;
    }
    for (int c = 'A'; c <= 'Z'; c++) {
        key[c] = 1;
    }
    for (int c = 'a'; c <= 'z'; c++) {
        key[c] = 1;
    }
    footerLineKeyChars = key;
}
-
- /**
- * Determine if b[ptr] matches src.
- *
- * @param b
- * the buffer to scan.
- * @param ptr
- * first position within b, this should match src[0].
- * @param src
- * the buffer to test for equality with b.
- * @return ptr + src.length if b[ptr..src.length] == src; else -1.
- */
- public static final int match(final byte[] b, int ptr, final byte[] src) {
- if (ptr + src.length > b.length)
- return -1;
- for (int i = 0; i < src.length; i++, ptr++)
- if (b[ptr] != src[i])
- return -1;
- return ptr;
- }
-
- private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
- '6', '7', '8', '9' };
-
- /**
- * Format a base 10 numeric into a temporary buffer.
- * <p>
- * Formatting is performed backwards. The method starts at offset
- * <code>o-1</code> and ends at <code>o-1-digits</code>, where
- * <code>digits</code> is the number of positions necessary to store the
- * base 10 value.
- * <p>
- * The argument and return values from this method make it easy to chain
- * writing, for example:
- * </p>
- *
- * <pre>
- * final byte[] tmp = new byte[64];
- * int ptr = tmp.length;
- * tmp[--ptr] = '\n';
- * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
- * tmp[--ptr] = ' ';
- * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
- * tmp[--ptr] = 0;
- * final String str = new String(tmp, ptr, tmp.length - ptr);
- * </pre>
- *
- * @param b
- * buffer to write into.
- * @param o
- * one offset past the location where writing will begin; writing
- * proceeds towards lower index values.
- * @param value
- * the value to store.
- * @return the new offset value <code>o</code>. This is the position of
- * the last byte written. Additional writing should start at one
- * position earlier.
- */
- public static int formatBase10(final byte[] b, int o, int value) {
- if (value == 0) {
- b[--o] = '0';
- return o;
- }
- final boolean isneg = value < 0;
- if (isneg)
- value = -value;
- while (value != 0) {
- b[--o] = base10byte[value % 10];
- value /= 10;
- }
- if (isneg)
- b[--o] = '-';
- return o;
- }
-
- /**
- * Parse a base 10 numeric from a sequence of ASCII digits into an int.
- * <p>
- * Digit sequences can begin with an optional run of spaces before the
- * sequence, and may start with a '+' or a '-' to indicate sign position.
- * Any other characters will cause the method to stop and return the current
- * result to the caller.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start parsing digits at.
- * @param ptrResult
- * optional location to return the new ptr value through. If null
- * the ptr value will be discarded.
- * @return the value at this location; 0 if the location is not a valid
- * numeric.
- */
- public static final int parseBase10(final byte[] b, int ptr,
- final MutableInteger ptrResult) {
- int r = 0;
- int sign = 0;
- try {
- final int sz = b.length;
- while (ptr < sz && b[ptr] == ' ')
- ptr++;
- if (ptr >= sz)
- return 0;
-
- switch (b[ptr]) {
- case '-':
- sign = -1;
- ptr++;
- break;
- case '+':
- ptr++;
- break;
- }
-
- while (ptr < sz) {
- final byte v = digits10[b[ptr]];
- if (v < 0)
- break;
- r = (r * 10) + v;
- ptr++;
- }
- } catch (ArrayIndexOutOfBoundsException e) {
- // Not a valid digit.
- }
- if (ptrResult != null)
- ptrResult.value = ptr;
- return sign < 0 ? -r : r;
- }
-
- /**
- * Parse a base 10 numeric from a sequence of ASCII digits into a long.
- * <p>
- * Digit sequences can begin with an optional run of spaces before the
- * sequence, and may start with a '+' or a '-' to indicate sign position.
- * Any other characters will cause the method to stop and return the current
- * result to the caller.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start parsing digits at.
- * @param ptrResult
- * optional location to return the new ptr value through. If null
- * the ptr value will be discarded.
- * @return the value at this location; 0 if the location is not a valid
- * numeric.
- */
- public static final long parseLongBase10(final byte[] b, int ptr,
- final MutableInteger ptrResult) {
- long r = 0;
- int sign = 0;
- try {
- final int sz = b.length;
- while (ptr < sz && b[ptr] == ' ')
- ptr++;
- if (ptr >= sz)
- return 0;
-
- switch (b[ptr]) {
- case '-':
- sign = -1;
- ptr++;
- break;
- case '+':
- ptr++;
- break;
- }
-
- while (ptr < sz) {
- final byte v = digits10[b[ptr]];
- if (v < 0)
- break;
- r = (r * 10) + v;
- ptr++;
- }
- } catch (ArrayIndexOutOfBoundsException e) {
- // Not a valid digit.
- }
- if (ptrResult != null)
- ptrResult.value = ptr;
- return sign < 0 ? -r : r;
- }
-
- /**
- * Parse 4 character base 16 (hex) formatted string to unsigned integer.
- * <p>
- * The number is read in network byte order, that is, most significant
- * nybble first.
- *
- * @param bs
- * buffer to parse digits from; positions {@code [p, p+4)} will
- * be parsed.
- * @param p
- * first position within the buffer to parse.
- * @return the integer value.
- * @throws ArrayIndexOutOfBoundsException
- * if the string is not hex formatted.
- */
- public static final int parseHexInt16(final byte[] bs, final int p) {
- int r = digits16[bs[p]] << 4;
-
- r |= digits16[bs[p + 1]];
- r <<= 4;
-
- r |= digits16[bs[p + 2]];
- r <<= 4;
-
- r |= digits16[bs[p + 3]];
- if (r < 0)
- throw new ArrayIndexOutOfBoundsException();
- return r;
- }
-
- /**
- * Parse 8 character base 16 (hex) formatted string to unsigned integer.
- * <p>
- * The number is read in network byte order, that is, most significant
- * nybble first.
- *
- * @param bs
- * buffer to parse digits from; positions {@code [p, p+8)} will
- * be parsed.
- * @param p
- * first position within the buffer to parse.
- * @return the integer value.
- * @throws ArrayIndexOutOfBoundsException
- * if the string is not hex formatted.
- */
- public static final int parseHexInt32(final byte[] bs, final int p) {
- int r = digits16[bs[p]] << 4;
-
- r |= digits16[bs[p + 1]];
- r <<= 4;
-
- r |= digits16[bs[p + 2]];
- r <<= 4;
-
- r |= digits16[bs[p + 3]];
- r <<= 4;
-
- r |= digits16[bs[p + 4]];
- r <<= 4;
-
- r |= digits16[bs[p + 5]];
- r <<= 4;
-
- r |= digits16[bs[p + 6]];
-
- final int last = digits16[bs[p + 7]];
- if (r < 0 || last < 0)
- throw new ArrayIndexOutOfBoundsException();
- return (r << 4) | last;
- }
-
- /**
- * Parse 16 character base 16 (hex) formatted string to unsigned long.
- * <p>
- * The number is read in network byte order, that is, most significant
- * nibble first.
- *
- * @param bs
- * buffer to parse digits from; positions {@code [p, p+16)} will
- * be parsed.
- * @param p
- * first position within the buffer to parse.
- * @return the integer value.
- * @throws ArrayIndexOutOfBoundsException
- * if the string is not hex formatted.
- * @since 4.3
- */
- public static final long parseHexInt64(final byte[] bs, final int p) {
- long r = digits16[bs[p]] << 4;
-
- r |= digits16[bs[p + 1]];
- r <<= 4;
-
- r |= digits16[bs[p + 2]];
- r <<= 4;
-
- r |= digits16[bs[p + 3]];
- r <<= 4;
-
- r |= digits16[bs[p + 4]];
- r <<= 4;
-
- r |= digits16[bs[p + 5]];
- r <<= 4;
-
- r |= digits16[bs[p + 6]];
- r <<= 4;
-
- r |= digits16[bs[p + 7]];
- r <<= 4;
-
- r |= digits16[bs[p + 8]];
- r <<= 4;
-
- r |= digits16[bs[p + 9]];
- r <<= 4;
-
- r |= digits16[bs[p + 10]];
- r <<= 4;
-
- r |= digits16[bs[p + 11]];
- r <<= 4;
-
- r |= digits16[bs[p + 12]];
- r <<= 4;
-
- r |= digits16[bs[p + 13]];
- r <<= 4;
-
- r |= digits16[bs[p + 14]];
-
- final int last = digits16[bs[p + 15]];
- if (r < 0 || last < 0)
- throw new ArrayIndexOutOfBoundsException();
- return (r << 4) | last;
- }
-
- /**
- * Parse a single hex digit to its numeric value (0-15).
- *
- * @param digit
- * hex character to parse.
- * @return numeric value, in the range 0-15.
- * @throws ArrayIndexOutOfBoundsException
- * if the input digit is not a valid hex digit.
- */
- public static final int parseHexInt4(final byte digit) {
- final byte r = digits16[digit];
- if (r < 0)
- throw new ArrayIndexOutOfBoundsException();
- return r;
- }
-
- /**
- * Parse a Git style timezone string.
- * <p>
- * The sequence "-0315" will be parsed as the numeric value -195, as the
- * lower two positions count minutes, not 100ths of an hour.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start parsing digits at.
- * @return the timezone at this location, expressed in minutes.
- */
- public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
- return parseTimeZoneOffset(b, ptr, null);
- }
-
- /**
- * Parse a Git style timezone string.
- * <p>
- * The sequence "-0315" will be parsed as the numeric value -195, as the
- * lower two positions count minutes, not 100ths of an hour.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start parsing digits at.
- * @param ptrResult
- * optional location to return the new ptr value through. If null
- * the ptr value will be discarded.
- * @return the timezone at this location, expressed in minutes.
- * @since 4.1
- */
- public static final int parseTimeZoneOffset(final byte[] b, int ptr,
- MutableInteger ptrResult) {
- final int v = parseBase10(b, ptr, ptrResult);
- final int tzMins = v % 100;
- final int tzHours = v / 100;
- return tzHours * 60 + tzMins;
- }
-
- /**
- * Locate the first position after a given character.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start looking for chrA at.
- * @param chrA
- * character to find.
- * @return new position just after chrA.
- */
- public static final int next(final byte[] b, int ptr, final char chrA) {
- final int sz = b.length;
- while (ptr < sz) {
- if (b[ptr++] == chrA)
- return ptr;
- }
- return ptr;
- }
-
- /**
- * Locate the first position after the next LF.
- * <p>
- * This method stops on the first '\n' it finds.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start looking for LF at.
- * @return new position just after the first LF found.
- */
- public static final int nextLF(final byte[] b, int ptr) {
- return next(b, ptr, '\n');
- }
-
- /**
- * Locate the first position after either the given character or LF.
- * <p>
- * This method stops on the first match it finds from either chrA or '\n'.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start looking for chrA or LF at.
- * @param chrA
- * character to find.
- * @return new position just after the first chrA or LF to be found.
- */
- public static final int nextLF(final byte[] b, int ptr, final char chrA) {
- final int sz = b.length;
- while (ptr < sz) {
- final byte c = b[ptr++];
- if (c == chrA || c == '\n')
- return ptr;
- }
- return ptr;
- }
-
- /**
- * Locate the first position before a given character.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start looking for chrA at.
- * @param chrA
- * character to find.
- * @return new position just before chrA, -1 for not found
- */
- public static final int prev(final byte[] b, int ptr, final char chrA) {
- if (ptr == b.length)
- --ptr;
- while (ptr >= 0) {
- if (b[ptr--] == chrA)
- return ptr;
- }
- return ptr;
- }
-
- /**
- * Locate the first position before the previous LF.
- * <p>
- * This method stops on the first '\n' it finds.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start looking for LF at.
- * @return new position just before the first LF found, -1 for not found
- */
- public static final int prevLF(final byte[] b, int ptr) {
- return prev(b, ptr, '\n');
- }
-
- /**
- * Locate the previous position before either the given character or LF.
- * <p>
- * This method stops on the first match it finds from either chrA or '\n'.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position within buffer to start looking for chrA or LF at.
- * @param chrA
- * character to find.
- * @return new position just before the first chrA or LF to be found, -1 for
- * not found
- */
- public static final int prevLF(final byte[] b, int ptr, final char chrA) {
- if (ptr == b.length)
- --ptr;
- while (ptr >= 0) {
- final byte c = b[ptr--];
- if (c == chrA || c == '\n')
- return ptr;
- }
- return ptr;
- }
-
- /**
- * Index the region between <code>[ptr, end)</code> to find line starts.
- * <p>
- * The returned list is 1 indexed. Index 0 contains
- * {@link Integer#MIN_VALUE} to pad the list out.
- * <p>
- * Using a 1 indexed list means that line numbers can be directly accessed
- * from the list, so <code>list.get(1)</code> (aka get line 1) returns
- * <code>ptr</code>.
- * <p>
- * The last element (index <code>map.size()-1</code>) always contains
- * <code>end</code>.
- * <p>
- * If the data contains a '\0' anywhere, the whole region is considered binary
- * and a LineMap corresponding to a single line is returned.
- * </p>
- *
- * @param buf
- * buffer to scan.
- * @param ptr
- * position within the buffer corresponding to the first byte of
- * line 1.
- * @param end
- * 1 past the end of the content within <code>buf</code>.
- * @return a line map indexing the start position of each line.
- */
- public static final IntList lineMap(final byte[] buf, int ptr, int end) {
- int start = ptr;
-
- // Experimentally derived from multiple source repositories
- // the average number of bytes/line is 36. Its a rough guess
- // to initially size our map close to the target.
- IntList map = new IntList((end - ptr) / 36);
- map.add(Integer.MIN_VALUE);
- boolean foundLF = true;
- for (; ptr < end; ptr++) {
- if (foundLF) {
- map.add(ptr);
- }
-
- if (buf[ptr] == '\0') {
- // binary data.
- map = new IntList(3);
- map.add(Integer.MIN_VALUE);
- map.add(start);
- break;
- }
-
- foundLF = (buf[ptr] == '\n');
- }
- map.add(end);
- return map;
- }
-
- /**
- * Locate the "author " header line data.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position in buffer to start the scan at. Most callers should
- * pass 0 to ensure the scan starts from the beginning of the
- * commit buffer and does not accidentally look at message body.
- * @return position just after the space in "author ", so the first
- * character of the author's name. If no author header can be
- * located -1 is returned.
- */
- public static final int author(final byte[] b, int ptr) {
- final int sz = b.length;
- if (ptr == 0)
- ptr += 46; // skip the "tree ..." line.
- while (ptr < sz && b[ptr] == 'p')
- ptr += 48; // skip this parent.
- return match(b, ptr, author);
- }
-
- /**
- * Locate the "committer " header line data.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position in buffer to start the scan at. Most callers should
- * pass 0 to ensure the scan starts from the beginning of the
- * commit buffer and does not accidentally look at message body.
- * @return position just after the space in "committer ", so the first
- * character of the committer's name. If no committer header can be
- * located -1 is returned.
- */
- public static final int committer(final byte[] b, int ptr) {
- final int sz = b.length;
- if (ptr == 0)
- ptr += 46; // skip the "tree ..." line.
- while (ptr < sz && b[ptr] == 'p')
- ptr += 48; // skip this parent.
- if (ptr < sz && b[ptr] == 'a')
- ptr = nextLF(b, ptr);
- return match(b, ptr, committer);
- }
-
- /**
- * Locate the "tagger " header line data.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position in buffer to start the scan at. Most callers should
- * pass 0 to ensure the scan starts from the beginning of the tag
- * buffer and does not accidentally look at message body.
- * @return position just after the space in "tagger ", so the first
- * character of the tagger's name. If no tagger header can be
- * located -1 is returned.
- */
- public static final int tagger(final byte[] b, int ptr) {
- final int sz = b.length;
- if (ptr == 0)
- ptr += 48; // skip the "object ..." line.
- while (ptr < sz) {
- if (b[ptr] == '\n')
- return -1;
- final int m = match(b, ptr, tagger);
- if (m >= 0)
- return m;
- ptr = nextLF(b, ptr);
- }
- return -1;
- }
-
- /**
- * Locate the "encoding " header line.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position in buffer to start the scan at. Most callers should
- * pass 0 to ensure the scan starts from the beginning of the
- * buffer and does not accidentally look at the message body.
- * @return position just after the space in "encoding ", so the first
- * character of the encoding's name. If no encoding header can be
- * located -1 is returned (and UTF-8 should be assumed).
- */
- public static final int encoding(final byte[] b, int ptr) {
- final int sz = b.length;
- while (ptr < sz) {
- if (b[ptr] == '\n')
- return -1;
- if (b[ptr] == 'e')
- break;
- ptr = nextLF(b, ptr);
- }
- return match(b, ptr, encoding);
- }
-
- /**
- * Parse the "encoding " header as a string.
- * <p>
- * Locates the "encoding " header (if present) and returns its value.
- *
- * @param b
- * buffer to scan.
- * @return the encoding header as specified in the commit; null if the
- * header was not present and should be assumed.
- * @since 4.2
- */
- @Nullable
- public static String parseEncodingName(final byte[] b) {
- int enc = encoding(b, 0);
- if (enc < 0) {
- return null;
- }
- int lf = nextLF(b, enc);
- return decode(UTF_8, b, enc, lf - 1);
- }
-
- /**
- * Parse the "encoding " header into a character set reference.
- * <p>
- * Locates the "encoding " header (if present) by first calling
- * {@link #encoding(byte[], int)} and then returns the proper character set
- * to apply to this buffer to evaluate its contents as character data.
- * <p>
- * If no encoding header is present {@code UTF-8} is assumed.
- *
- * @param b
- * buffer to scan.
- * @return the Java character set representation. Never null.
- * @throws IllegalCharsetNameException
- * if the character set requested by the encoding header is
- * malformed and unsupportable.
- * @throws UnsupportedCharsetException
- * if the JRE does not support the character set requested by
- * the encoding header.
- */
- public static Charset parseEncoding(final byte[] b) {
- String enc = parseEncodingName(b);
- if (enc == null) {
- return UTF_8;
- }
-
- String name = enc.trim();
- try {
- return Charset.forName(name);
- } catch (IllegalCharsetNameException
- | UnsupportedCharsetException badName) {
- Charset aliased = charsetForAlias(name);
- if (aliased != null) {
- return aliased;
- }
- throw badName;
- }
- }
-
- /**
- * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
- * <p>
- * Leading spaces won't be trimmed from the string, i.e. will show up in the
- * parsed name afterwards.
- *
- * @param in
- * the string to parse a name from.
- * @return the parsed identity or null in case the identity could not be
- * parsed.
- */
- public static PersonIdent parsePersonIdent(final String in) {
- return parsePersonIdent(Constants.encode(in), 0);
- }
-
- /**
- * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
- * <p>
- * When passing in a value for <code>nameB</code> callers should use the
- * return value of {@link #author(byte[], int)} or
- * {@link #committer(byte[], int)}, as these methods provide the proper
- * position within the buffer.
- *
- * @param raw
- * the buffer to parse character data from.
- * @param nameB
- * first position of the identity information. This should be the
- * first position after the space which delimits the header field
- * name (e.g. "author" or "committer") from the rest of the
- * identity line.
- * @return the parsed identity or null in case the identity could not be
- * parsed.
- */
- public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
- Charset cs;
- try {
- cs = parseEncoding(raw);
- } catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
- // Assume UTF-8 for person identities, usually this is correct.
- // If not decode() will fall back to the ISO-8859-1 encoding.
- cs = UTF_8;
- }
-
- final int emailB = nextLF(raw, nameB, '<');
- final int emailE = nextLF(raw, emailB, '>');
- if (emailB >= raw.length || raw[emailB] == '\n' ||
- (emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
- return null;
-
- final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
- emailB - 2 : emailB - 1;
- final String name = decode(cs, raw, nameB, nameEnd);
- final String email = decode(cs, raw, emailB, emailE - 1);
-
- // Start searching from end of line, as after first name-email pair,
- // another name-email pair may occur. We will ignore all kinds of
- // "junk" following the first email.
- //
- // We've to use (emailE - 1) for the case that raw[email] is LF,
- // otherwise we would run too far. "-2" is necessary to position
- // before the LF in case of LF termination resp. the penultimate
- // character if there is no trailing LF.
- final int tzBegin = lastIndexOfTrim(raw, ' ',
- nextLF(raw, emailE - 1) - 2) + 1;
- if (tzBegin <= emailE) // No time/zone, still valid
- return new PersonIdent(name, email, 0, 0);
-
- final int whenBegin = Math.max(emailE,
- lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
- if (whenBegin >= tzBegin - 1) // No time/zone, still valid
- return new PersonIdent(name, email, 0, 0);
-
- final long when = parseLongBase10(raw, whenBegin, null);
- final int tz = parseTimeZoneOffset(raw, tzBegin);
- return new PersonIdent(name, email, when * 1000L, tz);
- }
-
- /**
- * Parse a name data (e.g. as within a reflog) into a PersonIdent.
- * <p>
- * When passing in a value for <code>nameB</code> callers should use the
- * return value of {@link #author(byte[], int)} or
- * {@link #committer(byte[], int)}, as these methods provide the proper
- * position within the buffer.
- *
- * @param raw
- * the buffer to parse character data from.
- * @param nameB
- * first position of the identity information. This should be the
- * first position after the space which delimits the header field
- * name (e.g. "author" or "committer") from the rest of the
- * identity line.
- * @return the parsed identity. Never null.
- */
- public static PersonIdent parsePersonIdentOnly(final byte[] raw,
- final int nameB) {
- int stop = nextLF(raw, nameB);
- int emailB = nextLF(raw, nameB, '<');
- int emailE = nextLF(raw, emailB, '>');
- final String name;
- final String email;
- if (emailE < stop) {
- email = decode(raw, emailB, emailE - 1);
- } else {
- email = "invalid"; //$NON-NLS-1$
- }
- if (emailB < stop)
- name = decode(raw, nameB, emailB - 2);
- else
- name = decode(raw, nameB, stop);
-
- final MutableInteger ptrout = new MutableInteger();
- long when;
- int tz;
- if (emailE < stop) {
- when = parseLongBase10(raw, emailE + 1, ptrout);
- tz = parseTimeZoneOffset(raw, ptrout.value);
- } else {
- when = 0;
- tz = 0;
- }
- return new PersonIdent(name, email, when * 1000L, tz);
- }
-
- /**
- * Locate the end of a footer line key string.
- * <p>
- * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
- * "Signed-off-by: A. U. Thor\n") then this method returns the position of
- * the first ':'.
- * <p>
- * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
- * then this method returns -1.
- *
- * @param raw
- * buffer to scan.
- * @param ptr
- * first position within raw to consider as a footer line key.
- * @return position of the ':' which terminates the footer line key if this
- * is otherwise a valid footer line key; otherwise -1.
- */
- public static int endOfFooterLineKey(final byte[] raw, int ptr) {
- try {
- for (;;) {
- final byte c = raw[ptr];
- if (footerLineKeyChars[c] == 0) {
- if (c == ':')
- return ptr;
- return -1;
- }
- ptr++;
- }
- } catch (ArrayIndexOutOfBoundsException e) {
- return -1;
- }
- }
-
- /**
- * Decode a buffer under UTF-8, if possible.
- *
- * If the byte stream cannot be decoded that way, the platform default is tried
- * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
- *
- * @param buffer
- * buffer to pull raw bytes from.
- * @return a string representation of the range <code>[start,end)</code>,
- * after decoding the region through the specified character set.
- */
- public static String decode(final byte[] buffer) {
- return decode(buffer, 0, buffer.length);
- }
-
- /**
- * Decode a buffer under UTF-8, if possible.
- *
- * If the byte stream cannot be decoded that way, the platform default is
- * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
- *
- * @param buffer
- * buffer to pull raw bytes from.
- * @param start
- * start position in buffer
- * @param end
- * one position past the last location within the buffer to take
- * data from.
- * @return a string representation of the range <code>[start,end)</code>,
- * after decoding the region through the specified character set.
- */
- public static String decode(final byte[] buffer, final int start,
- final int end) {
- return decode(UTF_8, buffer, start, end);
- }
-
- /**
- * Decode a buffer under the specified character set if possible.
- *
- * If the byte stream cannot be decoded that way, the platform default is tried
- * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
- *
- * @param cs
- * character set to use when decoding the buffer.
- * @param buffer
- * buffer to pull raw bytes from.
- * @return a string representation of the range <code>[start,end)</code>,
- * after decoding the region through the specified character set.
- */
- public static String decode(final Charset cs, final byte[] buffer) {
- return decode(cs, buffer, 0, buffer.length);
- }
-
- /**
- * Decode a region of the buffer under the specified character set if possible.
- *
- * If the byte stream cannot be decoded that way, the platform default is tried
- * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
- *
- * @param cs
- * character set to use when decoding the buffer.
- * @param buffer
- * buffer to pull raw bytes from.
- * @param start
- * first position within the buffer to take data from.
- * @param end
- * one position past the last location within the buffer to take
- * data from.
- * @return a string representation of the range <code>[start,end)</code>,
- * after decoding the region through the specified character set.
- */
- public static String decode(final Charset cs, final byte[] buffer,
- final int start, final int end) {
- try {
- return decodeNoFallback(cs, buffer, start, end);
- } catch (CharacterCodingException e) {
- // Fall back to an ISO-8859-1 style encoding. At least all of
- // the bytes will be present in the output.
- //
- return extractBinaryString(buffer, start, end);
- }
- }
-
- /**
- * Decode a region of the buffer under the specified character set if
- * possible.
- *
- * If the byte stream cannot be decoded that way, the platform default is
- * tried and if that too fails, an exception is thrown.
- *
- * @param cs
- * character set to use when decoding the buffer.
- * @param buffer
- * buffer to pull raw bytes from.
- * @param start
- * first position within the buffer to take data from.
- * @param end
- * one position past the last location within the buffer to take
- * data from.
- * @return a string representation of the range <code>[start,end)</code>,
- * after decoding the region through the specified character set.
- * @throws CharacterCodingException
- * the input is not in any of the tested character sets.
- */
- public static String decodeNoFallback(final Charset cs,
- final byte[] buffer, final int start, final int end)
- throws CharacterCodingException {
- ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
- b.mark();
-
- // Try our built-in favorite. The assumption here is that
- // decoding will fail if the data is not actually encoded
- // using that encoder.
- try {
- return decode(b, UTF_8);
- } catch (CharacterCodingException e) {
- b.reset();
- }
-
- if (!cs.equals(UTF_8)) {
- // Try the suggested encoding, it might be right since it was
- // provided by the caller.
- try {
- return decode(b, cs);
- } catch (CharacterCodingException e) {
- b.reset();
- }
- }
-
- // Try the default character set. A small group of people
- // might actually use the same (or very similar) locale.
- Charset defcs = Charset.defaultCharset();
- if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
- try {
- return decode(b, defcs);
- } catch (CharacterCodingException e) {
- b.reset();
- }
- }
-
- throw new CharacterCodingException();
- }
-
- /**
- * Decode a region of the buffer under the ISO-8859-1 encoding.
- *
- * Each byte is treated as a single character in the 8859-1 character
- * encoding, performing a raw binary->char conversion.
- *
- * @param buffer
- * buffer to pull raw bytes from.
- * @param start
- * first position within the buffer to take data from.
- * @param end
- * one position past the last location within the buffer to take
- * data from.
- * @return a string representation of the range <code>[start,end)</code>.
- */
- public static String extractBinaryString(final byte[] buffer,
- final int start, final int end) {
- final StringBuilder r = new StringBuilder(end - start);
- for (int i = start; i < end; i++)
- r.append((char) (buffer[i] & 0xff));
- return r.toString();
- }
-
- private static String decode(final ByteBuffer b, final Charset charset)
- throws CharacterCodingException {
- final CharsetDecoder d = charset.newDecoder();
- d.onMalformedInput(CodingErrorAction.REPORT);
- d.onUnmappableCharacter(CodingErrorAction.REPORT);
- return d.decode(b).toString();
- }
-
- /**
- * Locate the position of the commit message body.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position in buffer to start the scan at. Most callers should
- * pass 0 to ensure the scan starts from the beginning of the
- * commit buffer.
- * @return position of the user's message buffer.
- */
- public static final int commitMessage(final byte[] b, int ptr) {
- final int sz = b.length;
- if (ptr == 0)
- ptr += 46; // skip the "tree ..." line.
- while (ptr < sz && b[ptr] == 'p')
- ptr += 48; // skip this parent.
-
- // Skip any remaining header lines, ignoring what their actual
- // header line type is. This is identical to the logic for a tag.
- //
- return tagMessage(b, ptr);
- }
-
- /**
- * Locate the position of the tag message body.
- *
- * @param b
- * buffer to scan.
- * @param ptr
- * position in buffer to start the scan at. Most callers should
- * pass 0 to ensure the scan starts from the beginning of the tag
- * buffer.
- * @return position of the user's message buffer.
- */
- public static final int tagMessage(final byte[] b, int ptr) {
- final int sz = b.length;
- if (ptr == 0)
- ptr += 48; // skip the "object ..." line.
- while (ptr < sz && b[ptr] != '\n')
- ptr = nextLF(b, ptr);
- if (ptr < sz && b[ptr] == '\n')
- return ptr + 1;
- return -1;
- }
-
- /**
- * Locate the end of a paragraph.
- * <p>
- * A paragraph is ended by two consecutive LF bytes or CRLF pairs
- *
- * @param b
- * buffer to scan.
- * @param start
- * position in buffer to start the scan at. Most callers will
- * want to pass the first position of the commit message (as
- * found by {@link #commitMessage(byte[], int)}.
- * @return position of the LF at the end of the paragraph;
- * <code>b.length</code> if no paragraph end could be located.
- */
- public static final int endOfParagraph(final byte[] b, final int start) {
- int ptr = start;
- final int sz = b.length;
- while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
- ptr = nextLF(b, ptr);
- if (ptr > start && b[ptr - 1] == '\n')
- ptr--;
- if (ptr > start && b[ptr - 1] == '\r')
- ptr--;
- return ptr;
- }
-
- /**
- * @param raw
- * buffer to scan.
- * @param ch
- * character to find.
- * @param pos
- * starting position.
- * @return last index of ch in raw, trimming spaces.
- * @since 4.1
- */
- public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
- while (pos >= 0 && raw[pos] == ' ')
- pos--;
-
- while (pos >= 0 && raw[pos] != ch)
- pos--;
-
- return pos;
- }
-
- private static Charset charsetForAlias(String name) {
- return encodingAliases.get(StringUtils.toLowerCase(name));
- }
-
/** Not instantiable: this class only exposes static helpers. */
private RawParseUtils() {
    // Intentionally empty; the private constructor only blocks instantiation.
}
- }
|