author | Ivan Frade <ifrade@google.com> | 2025-01-23 15:49:30 -0800
committer | Ivan Frade <ifrade@google.com> | 2025-02-12 12:04:44 -0800
commit | 072e93fded7229f00002232b33ed91a0ef614ddb (patch)
tree | be09481a186af4e3a55025f221a8306dae3c74d4
parent | b3230f25e57858958f4fc1d01070dfc0bf08370e (diff)
midx.MultiPackIndexWriter: a writer for the multipack index format
From the git documentation[1]: While the pack-indexes provide fast lookup
per packfile, this performance degrades as the number of packfiles
increases, because abbreviations need to inspect every packfile and we
are more likely to have a miss on our most-recently-used packfile. For
some large repositories, repacking into a single packfile is not
feasible due to storage space or excessive repack times. (...)
The multi-pack-index (MIDX for short) stores a list of objects and
their offsets into multiple packfiles. (...) Thus, we can provide
O(log N) lookup time for any number of packfiles.
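To make the O(log N) lookup concrete: the OIDF fanout chunk narrows a query to the bucket of OIDs sharing the first byte, and a binary search finishes inside that bucket. A minimal sketch, with hypothetical `fanout` (256 cumulative counts from OIDF) and `oids` (sorted OIDL entries) arrays standing in for the real chunk data:

```java
// Hypothetical midx lookup: bucket by first byte, then binary search.
static int findPosition(int[] fanout, byte[][] oids, byte[] oid) {
	int b = oid[0] & 0xff;
	int lo = b == 0 ? 0 : fanout[b - 1]; // bucket start
	int hi = fanout[b]; // bucket end (exclusive)
	while (lo < hi) {
		int mid = (lo + hi) >>> 1;
		int cmp = java.util.Arrays.compareUnsigned(oids[mid], oid);
		if (cmp == 0) {
			return mid; // position in the midx object list
		} else if (cmp < 0) {
			lo = mid + 1;
		} else {
			hi = mid;
		}
	}
	return -1; // object is not in any indexed pack
}
```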
This is a writer for the multipack index format. The test only verifies
the "shape" of the file; once we have a parser we can also check the
values (especially the large offset handling).
On the JGit repository, the multipack index generated by this writer
passes the validation of `git multi-pack-index verify`.
[1] https://git-scm.com/docs/pack-format#_multi_pack_index_midx_files_have_the_following_format
Change-Id: I1fca599c4ebf28154f28f039c2c4cfe75b2dc79d
6 files changed, 852 insertions, 0 deletions
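As context for the diff below, this is roughly how the new writer is driven; it mirrors `generateJGitMidx()` in the compatibility test. The `writeMidxFor` helper and the choice of output path are illustrative, not part of this change:

```java
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;

import org.eclipse.jgit.internal.storage.file.ObjectDirectory;
import org.eclipse.jgit.internal.storage.file.Pack;
import org.eclipse.jgit.internal.storage.file.PackIndex;
import org.eclipse.jgit.internal.storage.midx.MultiPackIndexWriter;
import org.eclipse.jgit.internal.storage.pack.PackExt;
import org.eclipse.jgit.lib.NullProgressMonitor;

class MidxWriteExample {
	// Illustrative: index all local packs of 'odb' and write the result
	// to the standard multi-pack-index location under objects/pack.
	static void writeMidxFor(ObjectDirectory odb) throws IOException {
		Map<String, PackIndex> indexes = new HashMap<>();
		for (Pack pack : odb.getPacks()) {
			// Keyed by the .idx file name, as the tests do.
			indexes.put(pack.getPackFile().create(PackExt.INDEX).getName(),
					pack.getIndex());
		}
		Path midx = odb.getPackDirectory().toPath()
				.resolve("multi-pack-index");
		try (OutputStream out = Files.newOutputStream(midx)) {
			new MultiPackIndexWriter().write(NullProgressMonitor.INSTANCE,
					out, indexes);
		}
	}
}
```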
diff --git a/org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java b/org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java new file mode 100644 index 0000000000..bbff7d433f --- /dev/null +++ b/org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2025, Google Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.midx; + +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.eclipse.jgit.internal.storage.file.ObjectDirectory; +import org.eclipse.jgit.internal.storage.file.Pack; +import org.eclipse.jgit.internal.storage.file.PackFile; +import org.eclipse.jgit.internal.storage.file.PackIndex; +import org.eclipse.jgit.internal.storage.pack.PackExt; +import org.eclipse.jgit.lib.NullProgressMonitor; +import org.eclipse.jgit.test.resources.SampleDataRepositoryTestCase; +import org.eclipse.jgit.util.NB; +import org.junit.Test; + +public class CgitMidxCompatibilityTest extends SampleDataRepositoryTestCase { + + @Test + public void jgitMidx_verifyByCgit() + throws IOException, InterruptedException { + byte[] jgitMidxBytes = generateJGitMidx(); + writeMidx(jgitMidxBytes); + assertEquals("cgit exit code", 0, run_cgit_multipackindex_verify()); + } + + @Test + public void compareBasicChunkSizes() + throws IOException, InterruptedException { + // We cannot compare byte-by-byte because there are optional chunks and + // it is not guaranteed what cgit and jgit will generate + byte[] jgitMidxBytes = generateJGitMidx(); + assertEquals("cgit exit code", 0, run_cgit_multipackindex_write()); + byte[] cgitMidxBytes = readCgitMidx(); + + RawMultiPackIndex jgitMidx = new RawMultiPackIndex(jgitMidxBytes); + RawMultiPackIndex cgitMidx = new RawMultiPackIndex(cgitMidxBytes); + + // This is a fixed-size chunk + assertEquals(256 * 4, cgitMidx.getChunkSize(MIDX_CHUNKID_OIDFANOUT)); + assertArrayEquals(cgitMidx.getRawChunk(MIDX_CHUNKID_OIDFANOUT), + jgitMidx.getRawChunk(MIDX_CHUNKID_OIDFANOUT)); + + assertArrayEquals(cgitMidx.getRawChunk(MIDX_CHUNKID_OIDLOOKUP), + jgitMidx.getRawChunk(MIDX_CHUNKID_OIDLOOKUP)); + + // The spec has changed from padding packnames to a multiple of four, to + // moving the packname chunk to the end of the file.
+ // git 2.48 pads the pack names to a multiple of 4 + // jgit puts the chunk at the end + byte[] cgitPacknames = trimPadding( + cgitMidx.getRawChunk(MIDX_CHUNKID_PACKNAMES)); + assertArrayEquals(cgitPacknames, + jgitMidx.getRawChunk(MIDX_CHUNKID_PACKNAMES)); + + assertArrayEquals(cgitMidx.getRawChunk(MIDX_CHUNKID_OBJECTOFFSETS), + jgitMidx.getRawChunk(MIDX_CHUNKID_OBJECTOFFSETS)); + + } + + private byte[] generateJGitMidx() throws IOException { + Map<String, PackIndex> indexes = new HashMap<>(); + for (Pack pack : db.getObjectDatabase().getPacks()) { + PackFile packFile = pack.getPackFile().create(PackExt.INDEX); + indexes.put(packFile.getName(), pack.getIndex()); + } + + MultiPackIndexWriter writer = new MultiPackIndexWriter(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + writer.write(NullProgressMonitor.INSTANCE, out, indexes); + return out.toByteArray(); + } + + private int run_cgit_multipackindex_write() + throws IOException, InterruptedException { + String[] command = new String[] { "git", "multi-pack-index", "write" }; + Process proc = Runtime.getRuntime().exec(command, new String[0], + db.getDirectory()); + return proc.waitFor(); + } + + private int run_cgit_multipackindex_verify() + throws IOException, InterruptedException { + String[] command = new String[] { "git", "multi-pack-index", "verify" }; + Process proc = Runtime.getRuntime().exec(command, new String[0], + db.getDirectory()); + return proc.waitFor(); + } + + private byte[] readCgitMidx() throws IOException { + File midx = getMIdxStandardLocation(); + assertTrue("cgit multi-pack-index exists", midx.exists()); + return Files.readAllBytes(midx.toPath()); + } + + private void writeMidx(byte[] midx) throws IOException { + File midxFile = getMIdxStandardLocation(); + Files.write(midxFile.toPath(), midx); + } + + private File getMIdxStandardLocation() { + return new File( + ((ObjectDirectory) db.getObjectDatabase()).getPackDirectory(), + "multi-pack-index"); + } + + private byte[] trimPadding(byte[] data) { + // Chunk MUST have one \0; we want to remove any extra \0 + int newEnd = data.length - 1; + while (newEnd - 1 >= 0 && data[newEnd - 1] == 0) { + newEnd--; + } + + if (newEnd == data.length - 1) { + return data; + } + return Arrays.copyOfRange(data, 0, newEnd + 1); + } + + private static class RawMultiPackIndex { + private final List<ChunkSegment> chunks; + + private final byte[] midx; + + private RawMultiPackIndex(byte[] midx) { + this.chunks = readChunks(midx); + this.midx = midx; + } + + long getChunkSize(int chunkId) { + int chunkPos = findChunkPosition(chunks, chunkId); + return chunks.get(chunkPos + 1).offset + - chunks.get(chunkPos).offset; + } + + long getOffset(int chunkId) { + return chunks.get(findChunkPosition(chunks, chunkId)).offset; + } + + private long getNextOffset(int chunkId) { + return chunks.get(findChunkPosition(chunks, chunkId) + 1).offset; + } + + byte[] getRawChunk(int chunkId) { + int start = (int) getOffset(chunkId); + int end = (int) getNextOffset(chunkId); + return Arrays.copyOfRange(midx, start, end); + } + + private static int findChunkPosition(List<ChunkSegment> chunks, + int id) { + int chunkPos = -1; + for (int i = 0; i < chunks.size(); i++) { + if (chunks.get(i).id() == id) { + chunkPos = i; + break; + } + } + if (chunkPos == -1) { + throw new IllegalStateException("Chunk doesn't exist"); + } + return chunkPos; + } + + private List<ChunkSegment> readChunks(byte[] midx) { + // Read the number of "chunkOffsets" (1 byte) + int chunkCount = midx[6]; + byte[] lookupBuffer = new
byte[CHUNK_LOOKUP_WIDTH + * (chunkCount + 1)]; + System.arraycopy(midx, 12, lookupBuffer, 0, lookupBuffer.length); + + List<ChunkSegment> chunks = new ArrayList<>(chunkCount + 1); + for (int i = 0; i <= chunkCount; i++) { + // chunks[chunkCount] is just a marker, in order to record the + // length of the last chunk. + int id = NB.decodeInt32(lookupBuffer, i * 12); + long offset = NB.decodeInt64(lookupBuffer, i * 12 + 4); + chunks.add(new ChunkSegment(id, offset)); + } + return chunks; + } + } + + private record ChunkSegment(int id, long offset) { + } +} diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java new file mode 100644 index 0000000000..82f3eb1e08 --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2025, Google Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.midx; + +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_LARGEOFFSETS; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_REVINDEX; +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Map; + +import org.eclipse.jgit.internal.storage.file.PackIndex; +import org.eclipse.jgit.junit.FakeIndexFactory; +import org.eclipse.jgit.junit.FakeIndexFactory.IndexObject; +import org.eclipse.jgit.lib.NullProgressMonitor; +import org.eclipse.jgit.util.NB; +import org.junit.Test; + +public class MultiPackIndexWriterTest { + + @Test + public void write_allSmallOffsets() throws IOException { + PackIndex index1 = indexOf( + object("0000000000000000000000000000000000000001", 500), + object("0000000000000000000000000000000000000003", 1500), + object("0000000000000000000000000000000000000005", 3000)); + PackIndex index2 = indexOf( + object("0000000000000000000000000000000000000002", 500), + object("0000000000000000000000000000000000000004", 1500), + object("0000000000000000000000000000000000000006", 3000)); + + Map<String, PackIndex> data = Map.of("packname1", index1, "packname2", + index2); + + MultiPackIndexWriter writer = new MultiPackIndexWriter(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + writer.write(NullProgressMonitor.INSTANCE, out, data); + // header (12 bytes) + // + chunkHeader (6 * 12 bytes) + // + fanout table (256 * 4 bytes) + // + OIDs (6 * 20 bytes) + // + (pack, offset) pairs (6 * 8) + // + RIDX (6 * 4 bytes) + // + packfile names (2 * 10) + // + checksum (20) + 
assertEquals(1340, out.size()); + List<Integer> chunkIds = readChunkIds(out); + assertEquals(5, chunkIds.size()); + assertEquals(0, chunkIds.indexOf(MIDX_CHUNKID_OIDFANOUT)); + assertEquals(1, chunkIds.indexOf(MIDX_CHUNKID_OIDLOOKUP)); + assertEquals(2, chunkIds.indexOf(MIDX_CHUNKID_OBJECTOFFSETS)); + assertEquals(3, chunkIds.indexOf(MIDX_CHUNKID_REVINDEX)); + assertEquals(4, chunkIds.indexOf(MIDX_CHUNKID_PACKNAMES)); + } + + @Test + public void write_smallOffset_limit() throws IOException { + PackIndex index1 = indexOf( + object("0000000000000000000000000000000000000001", 500), + object("0000000000000000000000000000000000000003", 1500), + object("0000000000000000000000000000000000000005", (1L << 32) -1)); + PackIndex index2 = indexOf( + object("0000000000000000000000000000000000000002", 500), + object("0000000000000000000000000000000000000004", 1500), + object("0000000000000000000000000000000000000006", 3000)); + Map<String, PackIndex> data = + Map.of("packname1", index1, "packname2", index2); + + MultiPackIndexWriter writer = new MultiPackIndexWriter(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + writer.write(NullProgressMonitor.INSTANCE, out, data); + // header (12 bytes) + // + chunkHeader (6 * 12 bytes) + // + fanout table (256 * 4 bytes) + // + OIDs (6 * 20 bytes) + // + (pack, offset) pairs (6 * 8) + // + RIDX (6 * 4 bytes) + // + packfile names (2 * 10) + // + checksum (20) + assertEquals(1340, out.size()); + List<Integer> chunkIds = readChunkIds(out); + assertEquals(5, chunkIds.size()); + assertEquals(0, chunkIds.indexOf(MIDX_CHUNKID_OIDFANOUT)); + assertEquals(1, chunkIds.indexOf(MIDX_CHUNKID_OIDLOOKUP)); + assertEquals(2, chunkIds.indexOf(MIDX_CHUNKID_OBJECTOFFSETS)); + assertEquals(3, chunkIds.indexOf(MIDX_CHUNKID_REVINDEX)); + assertEquals(4, chunkIds.indexOf(MIDX_CHUNKID_PACKNAMES)); + } + + @Test + public void write_largeOffset() throws IOException { + PackIndex index1 = indexOf( + object("0000000000000000000000000000000000000001", 500), + object("0000000000000000000000000000000000000003", 1500), + object("0000000000000000000000000000000000000005", 1L << 32)); + PackIndex index2 = indexOf( + object("0000000000000000000000000000000000000002", 500), + object("0000000000000000000000000000000000000004", 1500), + object("0000000000000000000000000000000000000006", 3000)); + Map<String, PackIndex> data = + Map.of("packname1", index1, "packname2", index2); + + MultiPackIndexWriter writer = new MultiPackIndexWriter(); + ByteArrayOutputStream out = new ByteArrayOutputStream(); + writer.write(NullProgressMonitor.INSTANCE, out, data); + // header (12 bytes) + // + chunkHeader (7 * 12 bytes) + // + fanout table (256 * 4 bytes) + // + OIDs (6 * 20 bytes) + // + (pack, offset) pairs (6 * 8) + // + (large-offset) (1 * 8) + // + RIDX (6 * 4 bytes) + // + packfile names (2 * 10) + // + checksum (20) + assertEquals(1360, out.size()); + List<Integer> chunkIds = readChunkIds(out); + assertEquals(6, chunkIds.size()); + assertEquals(0, chunkIds.indexOf(MIDX_CHUNKID_OIDFANOUT)); + assertEquals(1, chunkIds.indexOf(MIDX_CHUNKID_OIDLOOKUP)); + assertEquals(2, chunkIds.indexOf(MIDX_CHUNKID_OBJECTOFFSETS)); + assertEquals(3, chunkIds.indexOf(MIDX_CHUNKID_LARGEOFFSETS)); + assertEquals(4, chunkIds.indexOf(MIDX_CHUNKID_REVINDEX)); + assertEquals(5, chunkIds.indexOf(MIDX_CHUNKID_PACKNAMES)); + } + + private List<Integer> readChunkIds(ByteArrayOutputStream out) { + List<Integer> chunkIds = new ArrayList<>(); + byte[] raw = out.toByteArray(); + int numChunks = raw[6]; + int position = 12; + 
for (int i = 0; i < numChunks; i++) { + chunkIds.add(NB.decodeInt32(raw, position)); + position += CHUNK_LOOKUP_WIDTH; + } + return chunkIds; + } + + private static PackIndex indexOf(IndexObject... objs) { + return FakeIndexFactory.indexOf(Arrays.asList(objs)); + } + + private static IndexObject object(String name, long offset) { + return new IndexObject(name, offset); + } +} diff --git a/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties b/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties index 7c47d3a04f..392f4e9954 100644 --- a/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties +++ b/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties @@ -529,6 +529,8 @@ mkDirsFailed=Creating directories for {0} failed month=month months=months monthsAgo={0} months ago +multiPackIndexUnexpectedSize=MultiPack index: expected %d bytes but out has %d bytes +multiPackIndexWritingCancelled=Multipack index writing was canceled multipleMergeBasesFor=Multiple merge bases for:\n {0}\n {1} found:\n {2}\n {3} nameMustNotBeNullOrEmpty=Ref name must not be null or empty. need2Arguments=Need 2 arguments diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java index 07e882498c..d7fe456946 100644 --- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java @@ -559,6 +559,8 @@ public class JGitText extends TranslationBundle { /***/ public String month; /***/ public String months; /***/ public String monthsAgo; + /***/ public String multiPackIndexUnexpectedSize; + /***/ public String multiPackIndexWritingCancelled; /***/ public String multipleMergeBasesFor; /***/ public String nameMustNotBeNullOrEmpty; /***/ public String need2Arguments; diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java new file mode 100644 index 0000000000..6122a9a143 --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2025, Google Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.midx; + +class MultiPackIndexConstants { + static final int MIDX_SIGNATURE = 0x4d494458; /* MIDX */ + + static final byte MIDX_VERSION = 1; + + /** + * We infer the length of object IDs (OIDs) from this value: + * + * <pre> + * 1 => SHA-1 + * 2 => SHA-256 + * </pre> + */ + static final byte OID_HASH_VERSION = 1; + + static final int MULTIPACK_INDEX_FANOUT_SIZE = 4 * 256; + + /** + * First 4 bytes describe the chunk id. Value 0 is a terminating label. + * Other 8 bytes provide the byte-offset in current file for chunk to start. 
+ */ + static final int CHUNK_LOOKUP_WIDTH = 12; + + /** "PNAM" chunk */ + static final int MIDX_CHUNKID_PACKNAMES = 0x504e414d; + + /** "OIDF" chunk */ + static final int MIDX_CHUNKID_OIDFANOUT = 0x4f494446; + + /** "OIDL" chunk */ + static final int MIDX_CHUNKID_OIDLOOKUP = 0x4f49444c; + + /** "OOFF" chunk */ + static final int MIDX_CHUNKID_OBJECTOFFSETS = 0x4f4f4646; + + /** "LOFF" chunk */ + static final int MIDX_CHUNKID_LARGEOFFSETS = 0x4c4f4646; + + /** "RIDX" chunk */ + static final int MIDX_CHUNKID_REVINDEX = 0x52494458; + + private MultiPackIndexConstants() { + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java new file mode 100644 index 0000000000..28d2fb2334 --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java @@ -0,0 +1,422 @@ +/* + * Copyright (C) 2025, Google Inc. + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.midx; + +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_LARGEOFFSETS; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_REVINDEX; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_SIGNATURE; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_VERSION; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MULTIPACK_INDEX_FANOUT_SIZE; +import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.OID_HASH_VERSION; +import static org.eclipse.jgit.lib.Constants.OBJECT_ID_LENGTH; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import org.eclipse.jgit.internal.JGitText; +import org.eclipse.jgit.internal.storage.file.PackIndex; +import org.eclipse.jgit.internal.storage.io.CancellableDigestOutputStream; +import org.eclipse.jgit.internal.storage.midx.PackIndexMerger.MidxMutableEntry; +import org.eclipse.jgit.lib.ProgressMonitor; +import org.eclipse.jgit.util.NB; + +/** + * Writes a collection of indexes as a multipack index. 
+ * <p> + * See <a href= + * "https://git-scm.com/docs/pack-format#_multi_pack_index_midx_files_have_the_following_format">multipack + * index format spec</a> + */ +public class MultiPackIndexWriter { + + private static final int LIMIT_31_BITS = (1 << 31) - 1; + + private static final int MIDX_HEADER_SIZE = 12; + + /** + * Writes the inputs in the multipack index format in the outputStream. + * + * @param monitor + * progress monitor + * @param outputStream + * stream to write the multipack index file + * @param inputs + * pairs of name and index for each pack to include in the + * multipack index. + * @throws IOException + * Error writing to the stream + */ + public void write(ProgressMonitor monitor, OutputStream outputStream, + Map<String, PackIndex> inputs) throws IOException { + PackIndexMerger data = new PackIndexMerger(inputs); + + // List of chunks in the order they need to be written + List<ChunkHeader> chunkHeaders = createChunkHeaders(data); + long expectedSize = calculateExpectedSize(chunkHeaders); + try (CancellableDigestOutputStream out = new CancellableDigestOutputStream( + monitor, outputStream)) { + writeHeader(out, chunkHeaders.size(), data.getPackCount()); + writeChunkLookup(out, chunkHeaders); + + WriteContext ctx = new WriteContext(out, data); + for (ChunkHeader chunk : chunkHeaders) { + chunk.writerFn.write(ctx); + } + writeCheckSum(out); + if (expectedSize != out.length()) { + throw new IllegalStateException(String.format( + JGitText.get().multiPackIndexUnexpectedSize, + expectedSize, out.length())); + } + } catch (InterruptedIOException e) { + throw new IOException(JGitText.get().multiPackIndexWritingCancelled, + e); + } + } + + private static long calculateExpectedSize(List<ChunkHeader> chunks) { + int chunkLookup = (chunks.size() + 1) * CHUNK_LOOKUP_WIDTH; + long chunkContent = chunks.stream().mapToLong(c -> c.size).sum(); + return /* header */ 12 + chunkLookup + chunkContent + /* CRC */ 20; + } + + private List<ChunkHeader> createChunkHeaders(PackIndexMerger data) { + List<ChunkHeader> chunkHeaders = new ArrayList<>(); + chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OIDFANOUT, + MULTIPACK_INDEX_FANOUT_SIZE, this::writeFanoutTable)); + chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OIDLOOKUP, + (long) data.getUniqueObjectCount() * OBJECT_ID_LENGTH, + this::writeOidLookUp)); + chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OBJECTOFFSETS, + 8L * data.getUniqueObjectCount(), this::writeObjectOffsets)); + if (data.needsLargeOffsetsChunk()) { + chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_LARGEOFFSETS, + 8L * data.getOffsetsOver31BitsCount(), + this::writeObjectLargeOffsets)); + } + chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_REVINDEX, + 4L * data.getUniqueObjectCount(), this::writeRidx)); + + int packNamesSize = data.getPackNames().stream() + .mapToInt(String::length).map(i -> i + 1 /* null at the end */) + .sum(); + chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_PACKNAMES, packNamesSize, + this::writePackfileNames)); + return chunkHeaders; + } + + /** + * Write the first 12 bytes of the multipack index. + * <p> + * These bytes include things like magic number, version, number of + * chunks... 
+ * + * @param out + * output stream to write + * @param numChunks + * number of chunks this multipack index is going to have + * @param packCount + * number of packs covered by this multipack index + * @throws IOException + * error writing to the output stream + */ + private void writeHeader(CancellableDigestOutputStream out, int numChunks, + int packCount) throws IOException { + byte[] headerBuffer = new byte[MIDX_HEADER_SIZE]; + NB.encodeInt32(headerBuffer, 0, MIDX_SIGNATURE); + byte[] buff = { MIDX_VERSION, OID_HASH_VERSION, (byte) numChunks, + (byte) 0 }; + System.arraycopy(buff, 0, headerBuffer, 4, 4); + NB.encodeInt32(headerBuffer, 8, packCount); + out.write(headerBuffer, 0, headerBuffer.length); + out.flush(); + } + + /** + * Write a table of "chunkId, start-offset", with a special value "0, + * end-of-previous_chunk", to mark the end. + * + * @param out + * output stream to write + * @param chunkHeaders + * list of chunks in the order they are expected to be written + * @throws IOException + * error writing to the output stream + */ + private void writeChunkLookup(CancellableDigestOutputStream out, + List<ChunkHeader> chunkHeaders) throws IOException { + + // first chunk will start at header + this lookup block + long chunkStart = MIDX_HEADER_SIZE + + (long) (chunkHeaders.size() + 1) * CHUNK_LOOKUP_WIDTH; + byte[] chunkEntry = new byte[CHUNK_LOOKUP_WIDTH]; + for (ChunkHeader chunkHeader : chunkHeaders) { + NB.encodeInt32(chunkEntry, 0, chunkHeader.chunkId); + NB.encodeInt64(chunkEntry, 4, chunkStart); + out.write(chunkEntry); + chunkStart += chunkHeader.size; + } + // Terminating label for the block + // (chunkid 0, offset where the next block would start) + NB.encodeInt32(chunkEntry, 0, 0); + NB.encodeInt64(chunkEntry, 4, chunkStart); + out.write(chunkEntry); + } + + /** + * Write the fanout table for the object ids + * <p> + * Table with 256 entries (one byte), where the ith entry, F[i], stores the + * number of OIDs with first byte at most i. Thus, F[255] stores the total + * number of objects. + * + * @param ctx + * write context + * @throws IOException + * error writing to the output stream + */ + + private void writeFanoutTable(WriteContext ctx) throws IOException { + byte[] tmp = new byte[4]; + int[] fanout = new int[256]; + Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator(); + while (iterator.hasNext()) { + MidxMutableEntry e = iterator.next(); + fanout[e.getObjectId().getFirstByte() & 0xff]++; + } + for (int i = 1; i < fanout.length; i++) { + fanout[i] += fanout[i - 1]; + } + for (int n : fanout) { + NB.encodeInt32(tmp, 0, n); + ctx.out.write(tmp, 0, 4); + } + } + + /** + * Write the OID lookup chunk + * <p> + * A list of OIDs in sha1 order. + * + * @param ctx + * write context + * @throws IOException + * error writing to the output stream + */ + private void writeOidLookUp(WriteContext ctx) throws IOException { + byte[] tmp = new byte[OBJECT_ID_LENGTH]; + + Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator(); + while (iterator.hasNext()) { + MidxMutableEntry e = iterator.next(); + e.getObjectId().copyRawTo(tmp, 0); + ctx.out.write(tmp, 0, OBJECT_ID_LENGTH); + } + } + + /** + * Write the object offsets chunk + * <p> + * A list of offsets, parallel to the list of OIDs. If the offset is too + * large (see {@link #fitsIn31bits(long)}), this contains the position in + * the large offsets list (marked with a 1 in the most significant bit). 
+ * + * @param ctx + * write context + * @throws IOException + * error writing to the output stream + */ + private void writeObjectOffsets(WriteContext ctx) throws IOException { + byte[] entry = new byte[8]; + Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator(); + while (iterator.hasNext()) { + MidxMutableEntry e = iterator.next(); + NB.encodeInt32(entry, 0, e.getPackId()); + if (!ctx.data.needsLargeOffsetsChunk() + || fitsIn31bits(e.getOffset())) { + NB.encodeInt32(entry, 4, (int) e.getOffset()); + } else { + int offloadedPosition = ctx.largeOffsets.append(e.getOffset()); + NB.encodeInt32(entry, 4, offloadedPosition | (1 << 31)); + } + ctx.out.write(entry); + } + } + + /** + * Writes the reverse index chunk + * <p> + * This stores the position of the objects in the main index, ordered first + * by pack and then by offset + * + * @param ctx + * write context + * @throws IOException + * error writing to the output stream + */ + private void writeRidx(WriteContext ctx) throws IOException { + Map<Integer, List<OffsetPosition>> packOffsets = new HashMap<>( + ctx.data.getPackCount()); + // TODO(ifrade): Brute force solution loading all offsets/packs in + // memory. We could also iterate reverse indexes looking up + // their position in the midx (and discarding if the pack doesn't + // match). + Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator(); + int midxPosition = 0; + while (iterator.hasNext()) { + MidxMutableEntry e = iterator.next(); + OffsetPosition op = new OffsetPosition(e.getOffset(), midxPosition); + midxPosition++; + packOffsets.computeIfAbsent(e.getPackId(), k -> new ArrayList<>()) + .add(op); + } + + for (int i = 0; i < ctx.data.getPackCount(); i++) { + List<OffsetPosition> offsetsForPack = packOffsets.get(i); + if (offsetsForPack == null || offsetsForPack.isEmpty()) { + continue; + } + offsetsForPack.sort(Comparator.comparing(OffsetPosition::offset)); + byte[] ridxForPack = new byte[4 * offsetsForPack.size()]; + for (int j = 0; j < offsetsForPack.size(); j++) { + NB.encodeInt32(ridxForPack, j * 4, + offsetsForPack.get(j).position); + } + ctx.out.write(ridxForPack); + } + } + + /** + * Write the large offset chunk + * <p> + * A list of large offsets (long). The regular offset chunk will point to a + * position here. + * + * @param ctx + * writer context + * @throws IOException + * error writing to the output stream + */ + private void writeObjectLargeOffsets(WriteContext ctx) throws IOException { + ctx.out.write(ctx.largeOffsets.offsets, 0, + ctx.largeOffsets.bytePosition); + } + + /** + * Write the list of packfiles chunk + * <p> + * List of packfiles (in lexicographical order) with a \0 at the end + * + * @param ctx + * writer context + * @throws IOException + * error writing to the output stream + */ + private void writePackfileNames(WriteContext ctx) throws IOException { + for (String packName : ctx.data.getPackNames()) { + // Spec doesn't talk about encoding.
+ ctx.out.write(packName.getBytes(StandardCharsets.UTF_8)); + ctx.out.write(0); + } + } + + /** + * Write final checksum of the data written to the stream + * + * @param out + * output stream used to write + * @throws IOException + * error writing to the output stream + */ + private void writeCheckSum(CancellableDigestOutputStream out) + throws IOException { + out.write(out.getDigest()); + out.flush(); + } + + private record OffsetPosition(long offset, int position) { + } + + /** + * If there is at least one offset value larger than 2^32-1, then the large + * offset chunk must exist, and offsets larger than 2^31-1 must be stored in + * it instead. + * + * @param offset object offset + * + * @return true if the offset fits in 31 bits + */ + private static boolean fitsIn31bits(long offset) { + return offset <= LIMIT_31_BITS; + } + + private static class LargeOffsets { + private final byte[] offsets; + + private int bytePosition; + + LargeOffsets(int largeOffsetsCount) { + offsets = new byte[largeOffsetsCount * 8]; + bytePosition = 0; + } + + /** + * Add an offset to the large offset chunk + * + * @param largeOffset + * a large offset + * @return the position of the just-inserted offset (as in number of + * offsets, NOT in bytes) + */ + int append(long largeOffset) { + int at = bytePosition; + NB.encodeInt64(offsets, at, largeOffset); + bytePosition += 8; + return at / 8; + } + } + + private record ChunkHeader(int chunkId, long size, ChunkWriter writerFn) { + } + + @FunctionalInterface + private interface ChunkWriter { + void write(WriteContext ctx) throws IOException; + } + + private static class WriteContext { + final CancellableDigestOutputStream out; + + final PackIndexMerger data; + + final LargeOffsets largeOffsets; + + WriteContext(CancellableDigestOutputStream out, PackIndexMerger data) { + this.out = out; + this.data = data; + this.largeOffsets = new LargeOffsets( + data.getOffsetsOver31BitsCount()); + } + } +}
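To round out the offset handling above: each OOFF entry is a (pack id, 4-byte offset word) pair, and when a LOFF chunk exists, a set bit 31 turns the low 31 bits into a position in the large-offset list. A hedged sketch of the inverse decoding a reader would do (the parser itself is left for a later change, per the commit message; the `decodeOffset` helper and raw-chunk parameters are hypothetical):

```java
import org.eclipse.jgit.util.NB;

class MidxOffsetDecoding {
	// Illustrative inverse of writeObjectOffsets()/writeObjectLargeOffsets().
	// 'ooff' is the raw OOFF chunk; 'loff' is the raw LOFF chunk (unused
	// when 'hasLoff' is false); 'pos' is the object's midx position.
	static long decodeOffset(byte[] ooff, byte[] loff, boolean hasLoff,
			int pos) {
		int word = NB.decodeInt32(ooff, pos * 8 + 4);
		if (!hasLoff || (word & (1 << 31)) == 0) {
			// Inline offset: up to 2^32-1 without a LOFF chunk, and up
			// to 2^31-1 when one is present.
			return word & 0xffffffffL;
		}
		// Bit 31 set: low 31 bits index the list of 8-byte offsets.
		return NB.decodeInt64(loff, (word & 0x7fffffff) * 8);
	}
}
```

For example, the `1L << 32` offset written in `write_largeOffset` above round-trips through the LOFF chunk, while `3000` stays inline.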