aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIvan Frade <ifrade@google.com>2025-01-23 15:49:30 -0800
committerIvan Frade <ifrade@google.com>2025-02-12 12:04:44 -0800
commit072e93fded7229f00002232b33ed91a0ef614ddb (patch)
treebe09481a186af4e3a55025f221a8306dae3c74d4
parentb3230f25e57858958f4fc1d01070dfc0bf08370e (diff)
downloadjgit-072e93fded7229f00002232b33ed91a0ef614ddb.tar.gz
jgit-072e93fded7229f00002232b33ed91a0ef614ddb.zip
midx.MultiPackIndexWriter: a writer for the multipack index format
From git documentation[1]: While the pack-indexes provide fast lookup per packfile, this performance degrades as the number of packfiles increases, because abbreviations need to inspect every packfile and we are more likely to have a miss on our most-recently-used packfile. For some large repositories, repacking into a single packfile is not feasible due to storage space or excessive repack times. (...) The multi-pack-index (MIDX for short) stores a list of objects and their offsets into multiple packfiles. (...) Thus, we can provide O(log N) lookup time for any number of packfiles. This is a writer of the multipack index format. The test only verifies the "shape" of the file, when we get a parser we can check also the values (especially the large offset handling). On the JGit repository, the multipack index generated by this writer passes the validation of `git multi-pack-index verify`. [1] https://git-scm.com/docs/pack-format#_multi_pack_index_midx_files_have_the_following_format Change-Id: I1fca599c4ebf28154f28f039c2c4cfe75b2dc79d
-rw-r--r--org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java210
-rw-r--r--org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java161
-rw-r--r--org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties2
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java2
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java55
-rw-r--r--org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java422
6 files changed, 852 insertions, 0 deletions
diff --git a/org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java b/org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java
new file mode 100644
index 0000000000..bbff7d433f
--- /dev/null
+++ b/org.eclipse.jgit.test/exttst/org/eclipse/jgit/internal/storage/midx/CgitMidxCompatibilityTest.java
@@ -0,0 +1,210 @@
+/*
+ * Copyright (C) 2025, Google Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Distribution License v. 1.0 which is available at
+ * https://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.eclipse.jgit.internal.storage.midx;
+
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES;
+import static org.junit.Assert.assertArrayEquals;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertTrue;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.Files;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.eclipse.jgit.internal.storage.file.ObjectDirectory;
+import org.eclipse.jgit.internal.storage.file.Pack;
+import org.eclipse.jgit.internal.storage.file.PackFile;
+import org.eclipse.jgit.internal.storage.file.PackIndex;
+import org.eclipse.jgit.internal.storage.pack.PackExt;
+import org.eclipse.jgit.lib.NullProgressMonitor;
+import org.eclipse.jgit.test.resources.SampleDataRepositoryTestCase;
+import org.eclipse.jgit.util.NB;
+import org.junit.Test;
+
+public class CgitMidxCompatibilityTest extends SampleDataRepositoryTestCase {
+
+ @Test
+ public void jgitMidx_verifyByCgit()
+ throws IOException, InterruptedException {
+ byte[] jgitMidxBytes = generateJGitMidx();
+ writeMidx(jgitMidxBytes);
+ assertEquals("cgit exit code", 0, run_cgit_multipackindex_verify());
+ }
+
+ @Test
+ public void compareBasicChunkSizes()
+ throws IOException, InterruptedException {
+ // We cannot compare byte-by-byte because there are optional chunks and
+ // it is not guaranteed what cgit and jgit will generate
+ byte[] jgitMidxBytes = generateJGitMidx();
+ assertEquals("cgit exit code", 0, run_cgit_multipackindex_write());
+ byte[] cgitMidxBytes = readCgitMidx();
+
+ RawMultiPackIndex jgitMidx = new RawMultiPackIndex(jgitMidxBytes);
+ RawMultiPackIndex cgitMidx = new RawMultiPackIndex(cgitMidxBytes);
+
+ // This is a fixed sized chunk
+ assertEquals(256 * 4, cgitMidx.getChunkSize(MIDX_CHUNKID_OIDFANOUT));
+ assertArrayEquals(cgitMidx.getRawChunk(MIDX_CHUNKID_OIDFANOUT),
+ jgitMidx.getRawChunk(MIDX_CHUNKID_OIDFANOUT));
+
+ assertArrayEquals(cgitMidx.getRawChunk(MIDX_CHUNKID_OIDLOOKUP),
+ jgitMidx.getRawChunk(MIDX_CHUNKID_OIDLOOKUP));
+
+ // The spec has changed from padding packnames to a multiple of four, to
+ // move the packname chunk to the end of the file.
+ // git 2.48 pads the packs names to a multiple of 4
+ // jgit puts the chunk at the end
+ byte[] cgitPacknames = trimPadding(
+ cgitMidx.getRawChunk(MIDX_CHUNKID_PACKNAMES));
+ assertArrayEquals(cgitPacknames,
+ jgitMidx.getRawChunk(MIDX_CHUNKID_PACKNAMES));
+
+ assertArrayEquals(cgitMidx.getRawChunk(MIDX_CHUNKID_OBJECTOFFSETS),
+ jgitMidx.getRawChunk(MIDX_CHUNKID_OBJECTOFFSETS));
+
+ }
+
+ private byte[] generateJGitMidx() throws IOException {
+ Map<String, PackIndex> indexes = new HashMap<>();
+ for (Pack pack : db.getObjectDatabase().getPacks()) {
+ PackFile packFile = pack.getPackFile().create(PackExt.INDEX);
+ indexes.put(packFile.getName(), pack.getIndex());
+ }
+
+ MultiPackIndexWriter writer = new MultiPackIndexWriter();
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ writer.write(NullProgressMonitor.INSTANCE, out, indexes);
+ return out.toByteArray();
+ }
+
+ private int run_cgit_multipackindex_write()
+ throws IOException, InterruptedException {
+ String[] command = new String[] { "git", "multi-pack-index", "write" };
+ Process proc = Runtime.getRuntime().exec(command, new String[0],
+ db.getDirectory());
+ return proc.waitFor();
+ }
+
+ private int run_cgit_multipackindex_verify()
+ throws IOException, InterruptedException {
+ String[] command = new String[] { "git", "multi-pack-index", "verify" };
+ Process proc = Runtime.getRuntime().exec(command, new String[0],
+ db.getDirectory());
+ return proc.waitFor();
+ }
+
+ private byte[] readCgitMidx() throws IOException {
+ File midx = getMIdxStandardLocation();
+ assertTrue("cgit multi-pack-index exists", midx.exists());
+ return Files.readAllBytes(midx.toPath());
+ }
+
+ private void writeMidx(byte[] midx) throws IOException {
+ File midxFile = getMIdxStandardLocation();
+ Files.write(midxFile.toPath(), midx);
+ }
+
+ private File getMIdxStandardLocation() {
+ return new File(
+ ((ObjectDirectory) db.getObjectDatabase()).getPackDirectory(),
+ "multi-pack-index");
+ }
+
+ private byte[] trimPadding(byte[] data) {
+ // Chunk MUST have one \0, we want to remove any extra \0
+ int newEnd = data.length - 1;
+ while (newEnd - 1 >= 0 && data[newEnd - 1] == 0) {
+ newEnd--;
+ }
+
+ if (newEnd == data.length - 1) {
+ return data;
+ }
+ return Arrays.copyOfRange(data, 0, newEnd + 1);
+ }
+
+ private static class RawMultiPackIndex {
+ private final List<ChunkSegment> chunks;
+
+ private final byte[] midx;
+
+ private RawMultiPackIndex(byte[] midx) {
+ this.chunks = readChunks(midx);
+ this.midx = midx;
+ }
+
+ long getChunkSize(int chunkId) {
+ int chunkPos = findChunkPosition(chunks, chunkId);
+ return chunks.get(chunkPos + 1).offset
+ - chunks.get(chunkPos).offset;
+ }
+
+ long getOffset(int chunkId) {
+ return chunks.get(findChunkPosition(chunks, chunkId)).offset;
+ }
+
+ private long getNextOffset(int chunkId) {
+ return chunks.get(findChunkPosition(chunks, chunkId) + 1).offset;
+ }
+
+ byte[] getRawChunk(int chunkId) {
+ int start = (int) getOffset(chunkId);
+ int end = (int) getNextOffset(chunkId);
+ return Arrays.copyOfRange(midx, start, end);
+ }
+
+ private static int findChunkPosition(List<ChunkSegment> chunks,
+ int id) {
+ int chunkPos = -1;
+ for (int i = 0; i < chunks.size(); i++) {
+ if (chunks.get(i).id() == id) {
+ chunkPos = i;
+ break;
+ }
+ }
+ if (chunkPos == -1) {
+ throw new IllegalStateException("Chunk doesn't exist");
+ }
+ return chunkPos;
+ }
+
+ private List<ChunkSegment> readChunks(byte[] midx) {
+ // Read the number of "chunkOffsets" (1 byte)
+ int chunkCount = midx[6];
+ byte[] lookupBuffer = new byte[CHUNK_LOOKUP_WIDTH
+ * (chunkCount + 1)];
+ System.arraycopy(midx, 12, lookupBuffer, 0, lookupBuffer.length);
+
+ List<ChunkSegment> chunks = new ArrayList<>(chunkCount + 1);
+ for (int i = 0; i <= chunkCount; i++) {
+ // chunks[chunkCount] is just a marker, in order to record the
+ // length of the last chunk.
+ int id = NB.decodeInt32(lookupBuffer, i * 12);
+ long offset = NB.decodeInt64(lookupBuffer, i * 12 + 4);
+ chunks.add(new ChunkSegment(id, offset));
+ }
+ return chunks;
+ }
+ }
+
+ private record ChunkSegment(int id, long offset) {
+ }
+}
diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java
new file mode 100644
index 0000000000..82f3eb1e08
--- /dev/null
+++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriterTest.java
@@ -0,0 +1,161 @@
+/*
+ * Copyright (C) 2025, Google Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Distribution License v. 1.0 which is available at
+ * https://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.eclipse.jgit.internal.storage.midx;
+
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_LARGEOFFSETS;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_REVINDEX;
+import static org.junit.Assert.assertEquals;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Map;
+
+import org.eclipse.jgit.internal.storage.file.PackIndex;
+import org.eclipse.jgit.junit.FakeIndexFactory;
+import org.eclipse.jgit.junit.FakeIndexFactory.IndexObject;
+import org.eclipse.jgit.lib.NullProgressMonitor;
+import org.eclipse.jgit.util.NB;
+import org.junit.Test;
+
+public class MultiPackIndexWriterTest {
+
+ @Test
+ public void write_allSmallOffsets() throws IOException {
+ PackIndex index1 = indexOf(
+ object("0000000000000000000000000000000000000001", 500),
+ object("0000000000000000000000000000000000000003", 1500),
+ object("0000000000000000000000000000000000000005", 3000));
+ PackIndex index2 = indexOf(
+ object("0000000000000000000000000000000000000002", 500),
+ object("0000000000000000000000000000000000000004", 1500),
+ object("0000000000000000000000000000000000000006", 3000));
+
+ Map<String, PackIndex> data = Map.of("packname1", index1, "packname2",
+ index2);
+
+ MultiPackIndexWriter writer = new MultiPackIndexWriter();
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ writer.write(NullProgressMonitor.INSTANCE, out, data);
+ // header (12 bytes)
+ // + chunkHeader (6 * 12 bytes)
+ // + fanout table (256 * 4 bytes)
+ // + OIDs (6 * 20 bytes)
+ // + (pack, offset) pairs (6 * 8)
+ // + RIDX (6 * 4 bytes)
+ // + packfile names (2 * 10)
+ // + checksum (20)
+ assertEquals(1340, out.size());
+ List<Integer> chunkIds = readChunkIds(out);
+ assertEquals(5, chunkIds.size());
+ assertEquals(0, chunkIds.indexOf(MIDX_CHUNKID_OIDFANOUT));
+ assertEquals(1, chunkIds.indexOf(MIDX_CHUNKID_OIDLOOKUP));
+ assertEquals(2, chunkIds.indexOf(MIDX_CHUNKID_OBJECTOFFSETS));
+ assertEquals(3, chunkIds.indexOf(MIDX_CHUNKID_REVINDEX));
+ assertEquals(4, chunkIds.indexOf(MIDX_CHUNKID_PACKNAMES));
+ }
+
+ @Test
+ public void write_smallOffset_limit() throws IOException {
+ PackIndex index1 = indexOf(
+ object("0000000000000000000000000000000000000001", 500),
+ object("0000000000000000000000000000000000000003", 1500),
+ object("0000000000000000000000000000000000000005", (1L << 32) -1));
+ PackIndex index2 = indexOf(
+ object("0000000000000000000000000000000000000002", 500),
+ object("0000000000000000000000000000000000000004", 1500),
+ object("0000000000000000000000000000000000000006", 3000));
+ Map<String, PackIndex> data =
+ Map.of("packname1", index1, "packname2", index2);
+
+ MultiPackIndexWriter writer = new MultiPackIndexWriter();
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ writer.write(NullProgressMonitor.INSTANCE, out, data);
+ // header (12 bytes)
+ // + chunkHeader (6 * 12 bytes)
+ // + fanout table (256 * 4 bytes)
+ // + OIDs (6 * 20 bytes)
+ // + (pack, offset) pairs (6 * 8)
+ // + RIDX (6 * 4 bytes)
+ // + packfile names (2 * 10)
+ // + checksum (20)
+ assertEquals(1340, out.size());
+ List<Integer> chunkIds = readChunkIds(out);
+ assertEquals(5, chunkIds.size());
+ assertEquals(0, chunkIds.indexOf(MIDX_CHUNKID_OIDFANOUT));
+ assertEquals(1, chunkIds.indexOf(MIDX_CHUNKID_OIDLOOKUP));
+ assertEquals(2, chunkIds.indexOf(MIDX_CHUNKID_OBJECTOFFSETS));
+ assertEquals(3, chunkIds.indexOf(MIDX_CHUNKID_REVINDEX));
+ assertEquals(4, chunkIds.indexOf(MIDX_CHUNKID_PACKNAMES));
+ }
+
+ @Test
+ public void write_largeOffset() throws IOException {
+ PackIndex index1 = indexOf(
+ object("0000000000000000000000000000000000000001", 500),
+ object("0000000000000000000000000000000000000003", 1500),
+ object("0000000000000000000000000000000000000005", 1L << 32));
+ PackIndex index2 = indexOf(
+ object("0000000000000000000000000000000000000002", 500),
+ object("0000000000000000000000000000000000000004", 1500),
+ object("0000000000000000000000000000000000000006", 3000));
+ Map<String, PackIndex> data =
+ Map.of("packname1", index1, "packname2", index2);
+
+ MultiPackIndexWriter writer = new MultiPackIndexWriter();
+ ByteArrayOutputStream out = new ByteArrayOutputStream();
+ writer.write(NullProgressMonitor.INSTANCE, out, data);
+ // header (12 bytes)
+ // + chunkHeader (7 * 12 bytes)
+ // + fanout table (256 * 4 bytes)
+ // + OIDs (6 * 20 bytes)
+ // + (pack, offset) pairs (6 * 8)
+ // + (large-offset) (1 * 8)
+ // + RIDX (6 * 4 bytes)
+ // + packfile names (2 * 10)
+ // + checksum (20)
+ assertEquals(1360, out.size());
+ List<Integer> chunkIds = readChunkIds(out);
+ assertEquals(6, chunkIds.size());
+ assertEquals(0, chunkIds.indexOf(MIDX_CHUNKID_OIDFANOUT));
+ assertEquals(1, chunkIds.indexOf(MIDX_CHUNKID_OIDLOOKUP));
+ assertEquals(2, chunkIds.indexOf(MIDX_CHUNKID_OBJECTOFFSETS));
+ assertEquals(3, chunkIds.indexOf(MIDX_CHUNKID_LARGEOFFSETS));
+ assertEquals(4, chunkIds.indexOf(MIDX_CHUNKID_REVINDEX));
+ assertEquals(5, chunkIds.indexOf(MIDX_CHUNKID_PACKNAMES));
+ }
+
+ private List<Integer> readChunkIds(ByteArrayOutputStream out) {
+ List<Integer> chunkIds = new ArrayList<>();
+ byte[] raw = out.toByteArray();
+ int numChunks = raw[6];
+ int position = 12;
+ for (int i = 0; i < numChunks; i++) {
+ chunkIds.add(NB.decodeInt32(raw, position));
+ position += CHUNK_LOOKUP_WIDTH;
+ }
+ return chunkIds;
+ }
+
+ private static PackIndex indexOf(IndexObject... objs) {
+ return FakeIndexFactory.indexOf(Arrays.asList(objs));
+ }
+
+ private static IndexObject object(String name, long offset) {
+ return new IndexObject(name, offset);
+ }
+}
diff --git a/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties b/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties
index 7c47d3a04f..392f4e9954 100644
--- a/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties
+++ b/org.eclipse.jgit/resources/org/eclipse/jgit/internal/JGitText.properties
@@ -529,6 +529,8 @@ mkDirsFailed=Creating directories for {0} failed
month=month
months=months
monthsAgo={0} months ago
+multiPackIndexUnexpectedSize=MultiPack index: expected %d bytes but out has %d bytes
+multiPackIndexWritingCancelled=Multipack index writing was canceled
multipleMergeBasesFor=Multiple merge bases for:\n {0}\n {1} found:\n {2}\n {3}
nameMustNotBeNullOrEmpty=Ref name must not be null or empty.
need2Arguments=Need 2 arguments
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java
index 07e882498c..d7fe456946 100644
--- a/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/JGitText.java
@@ -559,6 +559,8 @@ public class JGitText extends TranslationBundle {
/***/ public String month;
/***/ public String months;
/***/ public String monthsAgo;
+ /***/ public String multiPackIndexUnexpectedSize;
+ /***/ public String multiPackIndexWritingCancelled;
/***/ public String multipleMergeBasesFor;
/***/ public String nameMustNotBeNullOrEmpty;
/***/ public String need2Arguments;
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java
new file mode 100644
index 0000000000..6122a9a143
--- /dev/null
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexConstants.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (C) 2025, Google Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Distribution License v. 1.0 which is available at
+ * https://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.eclipse.jgit.internal.storage.midx;
+
+class MultiPackIndexConstants {
+ static final int MIDX_SIGNATURE = 0x4d494458; /* MIDX */
+
+ static final byte MIDX_VERSION = 1;
+
+ /**
+ * We infer the length of object IDs (OIDs) from this value:
+ *
+ * <pre>
+ * 1 => SHA-1
+ * 2 => SHA-256
+ * </pre>
+ */
+ static final byte OID_HASH_VERSION = 1;
+
+ static final int MULTIPACK_INDEX_FANOUT_SIZE = 4 * 256;
+
+ /**
+ * First 4 bytes describe the chunk id. Value 0 is a terminating label.
+ * Other 8 bytes provide the byte-offset in current file for chunk to start.
+ */
+ static final int CHUNK_LOOKUP_WIDTH = 12;
+
+ /** "PNAM" chunk */
+ static final int MIDX_CHUNKID_PACKNAMES = 0x504e414d;
+
+ /** "OIDF" chunk */
+ static final int MIDX_CHUNKID_OIDFANOUT = 0x4f494446;
+
+ /** "OIDL" chunk */
+ static final int MIDX_CHUNKID_OIDLOOKUP = 0x4f49444c;
+
+ /** "OOFF" chunk */
+ static final int MIDX_CHUNKID_OBJECTOFFSETS = 0x4f4f4646;
+
+ /** "LOFF" chunk */
+ static final int MIDX_CHUNKID_LARGEOFFSETS = 0x4c4f4646;
+
+ /** "RIDX" chunk */
+ static final int MIDX_CHUNKID_REVINDEX = 0x52494458;
+
+ private MultiPackIndexConstants() {
+ }
+}
diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java
new file mode 100644
index 0000000000..28d2fb2334
--- /dev/null
+++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java
@@ -0,0 +1,422 @@
+/*
+ * Copyright (C) 2025, Google Inc.
+ *
+ * This program and the accompanying materials are made available under the
+ * terms of the Eclipse Distribution License v. 1.0 which is available at
+ * https://www.eclipse.org/org/documents/edl-v10.php.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+package org.eclipse.jgit.internal.storage.midx;
+
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_LARGEOFFSETS;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_REVINDEX;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_SIGNATURE;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_VERSION;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MULTIPACK_INDEX_FANOUT_SIZE;
+import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.OID_HASH_VERSION;
+import static org.eclipse.jgit.lib.Constants.OBJECT_ID_LENGTH;
+
+import java.io.IOException;
+import java.io.InterruptedIOException;
+import java.io.OutputStream;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.eclipse.jgit.internal.JGitText;
+import org.eclipse.jgit.internal.storage.file.PackIndex;
+import org.eclipse.jgit.internal.storage.io.CancellableDigestOutputStream;
+import org.eclipse.jgit.internal.storage.midx.PackIndexMerger.MidxMutableEntry;
+import org.eclipse.jgit.lib.ProgressMonitor;
+import org.eclipse.jgit.util.NB;
+
+/**
+ * Writes a collection of indexes as a multipack index.
+ * <p>
+ * See <a href=
+ * "https://git-scm.com/docs/pack-format#_multi_pack_index_midx_files_have_the_following_format">multipack
+ * index format spec</a>
+ */
+public class MultiPackIndexWriter {
+
+ private static final int LIMIT_31_BITS = (1 << 31) - 1;
+
+ private static final int MIDX_HEADER_SIZE = 12;
+
+ /**
+ * Writes the inputs in the multipack index format in the outputStream.
+ *
+ * @param monitor
+ * progress monitor
+ * @param outputStream
+ * stream to write the multipack index file
+ * @param inputs
+ * pairs of name and index for each pack to include in the
+ * multipack index.
+ * @throws IOException
+ * Error writing to the stream
+ */
+ public void write(ProgressMonitor monitor, OutputStream outputStream,
+ Map<String, PackIndex> inputs) throws IOException {
+ PackIndexMerger data = new PackIndexMerger(inputs);
+
+ // List of chunks in the order they need to be written
+ List<ChunkHeader> chunkHeaders = createChunkHeaders(data);
+ long expectedSize = calculateExpectedSize(chunkHeaders);
+ try (CancellableDigestOutputStream out = new CancellableDigestOutputStream(
+ monitor, outputStream)) {
+ writeHeader(out, chunkHeaders.size(), data.getPackCount());
+ writeChunkLookup(out, chunkHeaders);
+
+ WriteContext ctx = new WriteContext(out, data);
+ for (ChunkHeader chunk : chunkHeaders) {
+ chunk.writerFn.write(ctx);
+ }
+ writeCheckSum(out);
+ if (expectedSize != out.length()) {
+ throw new IllegalStateException(String.format(
+ JGitText.get().multiPackIndexUnexpectedSize,
+ expectedSize, out.length()));
+ }
+ } catch (InterruptedIOException e) {
+ throw new IOException(JGitText.get().multiPackIndexWritingCancelled,
+ e);
+ }
+ }
+
+ private static long calculateExpectedSize(List<ChunkHeader> chunks) {
+ int chunkLookup = (chunks.size() + 1) * CHUNK_LOOKUP_WIDTH;
+ long chunkContent = chunks.stream().mapToLong(c -> c.size).sum();
+ return /* header */ 12 + chunkLookup + chunkContent + /* checksum */ 20;
+ }
+
+ private List<ChunkHeader> createChunkHeaders(PackIndexMerger data) {
+ List<ChunkHeader> chunkHeaders = new ArrayList<>();
+ chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OIDFANOUT,
+ MULTIPACK_INDEX_FANOUT_SIZE, this::writeFanoutTable));
+ chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OIDLOOKUP,
+ (long) data.getUniqueObjectCount() * OBJECT_ID_LENGTH,
+ this::writeOidLookUp));
+ chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OBJECTOFFSETS,
+ 8L * data.getUniqueObjectCount(), this::writeObjectOffsets));
+ if (data.needsLargeOffsetsChunk()) {
+ chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_LARGEOFFSETS,
+ 8L * data.getOffsetsOver31BitsCount(),
+ this::writeObjectLargeOffsets));
+ }
+ chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_REVINDEX,
+ 4L * data.getUniqueObjectCount(), this::writeRidx));
+
+ int packNamesSize = data.getPackNames().stream()
+ .mapToInt(String::length).map(i -> i + 1 /* null at the end */)
+ .sum();
+ chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_PACKNAMES, packNamesSize,
+ this::writePackfileNames));
+ return chunkHeaders;
+ }
+
+ /**
+ * Write the first 12 bytes of the multipack index.
+ * <p>
+ * These bytes include things like magic number, version, number of
+ * chunks...
+ *
+ * @param out
+ * output stream to write
+ * @param numChunks
+ * number of chunks this multipack index is going to have
+ * @param packCount
+ * number of packs covered by this multipack index
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeHeader(CancellableDigestOutputStream out, int numChunks,
+ int packCount) throws IOException {
+ byte[] headerBuffer = new byte[MIDX_HEADER_SIZE];
+ NB.encodeInt32(headerBuffer, 0, MIDX_SIGNATURE);
+ byte[] buff = { MIDX_VERSION, OID_HASH_VERSION, (byte) numChunks,
+ (byte) 0 };
+ System.arraycopy(buff, 0, headerBuffer, 4, 4);
+ NB.encodeInt32(headerBuffer, 8, packCount);
+ out.write(headerBuffer, 0, headerBuffer.length);
+ out.flush();
+ }
+
+ /**
+ * Write a table of "chunkId, start-offset", with a special value "0,
+ * end-of-previous_chunk", to mark the end.
+ *
+ * @param out
+ * output stream to write
+ * @param chunkHeaders
+ * list of chunks in the order they are expected to be written
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeChunkLookup(CancellableDigestOutputStream out,
+ List<ChunkHeader> chunkHeaders) throws IOException {
+
+ // first chunk will start at header + this lookup block
+ long chunkStart = MIDX_HEADER_SIZE
+ + (long) (chunkHeaders.size() + 1) * CHUNK_LOOKUP_WIDTH;
+ byte[] chunkEntry = new byte[CHUNK_LOOKUP_WIDTH];
+ for (ChunkHeader chunkHeader : chunkHeaders) {
+ NB.encodeInt32(chunkEntry, 0, chunkHeader.chunkId);
+ NB.encodeInt64(chunkEntry, 4, chunkStart);
+ out.write(chunkEntry);
+ chunkStart += chunkHeader.size;
+ }
+ // Terminating label for the block
+ // (chunkid 0, offset where the next block would start)
+ NB.encodeInt32(chunkEntry, 0, 0);
+ NB.encodeInt64(chunkEntry, 4, chunkStart);
+ out.write(chunkEntry);
+ }
+
+ /**
+ * Write the fanout table for the object ids
+ * <p>
+ * Table with 256 entries (one per possible first byte), where the ith entry, F[i], stores the
+ * number of OIDs with first byte at most i. Thus, F[255] stores the total
+ * number of objects.
+ *
+ * @param ctx
+ * write context
+ * @throws IOException
+ * error writing to the output stream
+ */
+
+ private void writeFanoutTable(WriteContext ctx) throws IOException {
+ byte[] tmp = new byte[4];
+ int[] fanout = new int[256];
+ Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
+ while (iterator.hasNext()) {
+ MidxMutableEntry e = iterator.next();
+ fanout[e.getObjectId().getFirstByte() & 0xff]++;
+ }
+ for (int i = 1; i < fanout.length; i++) {
+ fanout[i] += fanout[i - 1];
+ }
+ for (int n : fanout) {
+ NB.encodeInt32(tmp, 0, n);
+ ctx.out.write(tmp, 0, 4);
+ }
+ }
+
+ /**
+ * Write the OID lookup chunk
+ * <p>
+ * A list of OIDs in sha1 order.
+ *
+ * @param ctx
+ * write context
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeOidLookUp(WriteContext ctx) throws IOException {
+ byte[] tmp = new byte[OBJECT_ID_LENGTH];
+
+ Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
+ while (iterator.hasNext()) {
+ MidxMutableEntry e = iterator.next();
+ e.getObjectId().copyRawTo(tmp, 0);
+ ctx.out.write(tmp, 0, OBJECT_ID_LENGTH);
+ }
+ }
+
+ /**
+ * Write the object offsets chunk
+ * <p>
+ * A list of offsets, parallel to the list of OIDs. If the offset is too
+ * large (see {@link #fitsIn31bits(long)}), this contains the position in
+ * the large offsets list (marked with a 1 in the most significant bit).
+ *
+ * @param ctx
+ * write context
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeObjectOffsets(WriteContext ctx) throws IOException {
+ byte[] entry = new byte[8];
+ Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
+ while (iterator.hasNext()) {
+ MidxMutableEntry e = iterator.next();
+ NB.encodeInt32(entry, 0, e.getPackId());
+ if (!ctx.data.needsLargeOffsetsChunk()
+ || fitsIn31bits(e.getOffset())) {
+ NB.encodeInt32(entry, 4, (int) e.getOffset());
+ } else {
+ int offloadedPosition = ctx.largeOffsets.append(e.getOffset());
+ NB.encodeInt32(entry, 4, offloadedPosition | (1 << 31));
+ }
+ ctx.out.write(entry);
+ }
+ }
+
+ /**
+ * Writes the reverse index chunk
+ * <p>
+ * This stores the position of the objects in the main index, ordered first
+ * by pack and then by offset
+ *
+ * @param ctx
+ * write context
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeRidx(WriteContext ctx) throws IOException {
+ Map<Integer, List<OffsetPosition>> packOffsets = new HashMap<>(
+ ctx.data.getPackCount());
+ // TODO(ifrade): Brute force solution loading all offsets/packs in
+ // memory. We could also iterate reverse indexes looking up
+ // their position in the midx (and discarding if the pack doesn't
+ // match).
+ Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
+ int midxPosition = 0;
+ while (iterator.hasNext()) {
+ MidxMutableEntry e = iterator.next();
+ OffsetPosition op = new OffsetPosition(e.getOffset(), midxPosition);
+ midxPosition++;
+ packOffsets.computeIfAbsent(e.getPackId(), k -> new ArrayList<>())
+ .add(op);
+ }
+
+ for (int i = 0; i < ctx.data.getPackCount(); i++) {
+ List<OffsetPosition> offsetsForPack = packOffsets.get(i);
+ if (offsetsForPack.isEmpty()) {
+ continue;
+ }
+ offsetsForPack.sort(Comparator.comparing(OffsetPosition::offset));
+ byte[] ridxForPack = new byte[4 * offsetsForPack.size()];
+ for (int j = 0; j < offsetsForPack.size(); j++) {
+ NB.encodeInt32(ridxForPack, j * 4,
+ offsetsForPack.get(j).position);
+ }
+ ctx.out.write(ridxForPack);
+ }
+ }
+
+ /**
+ * Write the large offset chunk
+ * <p>
+ * A list of large offsets (long). The regular offset chunk will point to a
+ * position here.
+ *
+ * @param ctx
+ * writer context
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeObjectLargeOffsets(WriteContext ctx) throws IOException {
+ ctx.out.write(ctx.largeOffsets.offsets, 0,
+ ctx.largeOffsets.bytePosition);
+ }
+
+ /**
+ * Write the list of packfiles chunk
+ * <p>
+ * List of packfiles (in lexicographical order) with an \0 at the end
+ *
+ * @param ctx
+ * writer context
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writePackfileNames(WriteContext ctx) throws IOException {
+ for (String packName : ctx.data.getPackNames()) {
+ // Spec doesn't talk about encoding.
+ ctx.out.write(packName.getBytes(StandardCharsets.UTF_8));
+ ctx.out.write(0);
+ }
+ }
+
+ /**
+ * Write final checksum of the data written to the stream
+ *
+ * @param out
+ * output stream used to write
+ * @throws IOException
+ * error writing to the output stream
+ */
+ private void writeCheckSum(CancellableDigestOutputStream out)
+ throws IOException {
+ out.write(out.getDigest());
+ out.flush();
+ }
+
+
+ private record OffsetPosition(long offset, int position) {
+ }
+
+ /**
+ * If there is at least one offset value larger than 2^32-1, then the large
+ * offset chunk must exist, and offsets larger than 2^31-1 must be stored in
+ * it instead
+ *
+ * @param offset object offset
+ *
+ * @return true if the offset fits in 31 bits
+ */
+ private static boolean fitsIn31bits(long offset) {
+ return offset <= LIMIT_31_BITS;
+ }
+
+ private static class LargeOffsets {
+ private final byte[] offsets;
+
+ private int bytePosition;
+
+ LargeOffsets(int largeOffsetsCount) {
+ offsets = new byte[largeOffsetsCount * 8];
+ bytePosition = 0;
+ }
+
+ /**
+ * Add an offset to the large offset chunk
+ *
+ * @param largeOffset
+ * a large offset
+ * @return the position of the just inserted offset (as in number of
+ * offsets, NOT in bytes)
+ */
+ int append(long largeOffset) {
+ int at = bytePosition;
+ NB.encodeInt64(offsets, at, largeOffset);
+ bytePosition += 8;
+ return at / 8;
+ }
+ }
+
+ private record ChunkHeader(int chunkId, long size, ChunkWriter writerFn) {
+ }
+
+ @FunctionalInterface
+ private interface ChunkWriter {
+ void write(WriteContext ctx) throws IOException;
+ }
+
+ private static class WriteContext {
+ final CancellableDigestOutputStream out;
+
+ final PackIndexMerger data;
+
+ final LargeOffsets largeOffsets;
+
+ WriteContext(CancellableDigestOutputStream out, PackIndexMerger data) {
+ this.out = out;
+ this.data = data;
+ this.largeOffsets = new LargeOffsets(
+ data.getOffsetsOver31BitsCount());
+ }
+ }
+}