org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/midx/MultiPackIndexWriter.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428

/*
 * Copyright (C) 2025, Google LLC
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
package org.eclipse.jgit.internal.storage.midx;

import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.CHUNK_LOOKUP_WIDTH;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_LARGEOFFSETS;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OBJECTOFFSETS;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDFANOUT;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_OIDLOOKUP;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_PACKNAMES;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_CHUNKID_REVINDEX;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_SIGNATURE;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MIDX_VERSION;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.MULTIPACK_INDEX_FANOUT_SIZE;
import static org.eclipse.jgit.internal.storage.midx.MultiPackIndexConstants.OID_HASH_VERSION;
import static org.eclipse.jgit.lib.Constants.OBJECT_ID_LENGTH;

import java.io.IOException;
import java.io.InterruptedIOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.internal.storage.file.PackIndex;
import org.eclipse.jgit.internal.storage.io.CancellableDigestOutputStream;
import org.eclipse.jgit.internal.storage.midx.PackIndexMerger.MidxMutableEntry;
import org.eclipse.jgit.lib.ProgressMonitor;
import org.eclipse.jgit.util.NB;

/**
 * Writes a collection of indexes as a multipack index.
 * <p>
 * The output is a fixed-size header, a chunk lookup table, the chunks
 * themselves (in the order listed in the lookup table) and a trailing
 * checksum of everything written before it.
 * <p>
 * See <a href=
 * "https://git-scm.com/docs/pack-format#_multi_pack_index_midx_files_have_the_following_format">multipack
 * index format spec</a>
 *
 * @since 7.2
 */
public class MultiPackIndexWriter {

	/** Largest offset value that can be represented with 31 bits. */
	private static final int LIMIT_31_BITS = Integer.MAX_VALUE;

	/** Size in bytes of the fixed header written by writeHeader. */
	private static final int MIDX_HEADER_SIZE = 12;

	/** Size in bytes of the checksum appended at the end of the file. */
	private static final int CHECKSUM_SIZE = 20;

	/**
	 * Writes the inputs in the multipack index format in the outputStream.
	 * <p>
	 * After writing, the number of bytes emitted is compared with the size
	 * calculated up front from the chunk headers; a mismatch is reported as
	 * an {@link IllegalStateException}.
	 *
	 * @param monitor
	 *            progress monitor
	 * @param outputStream
	 *            stream to write the multipack index file
	 * @param inputs
	 *            pairs of name and index for each pack to include in the
	 *            multipack index.
	 * @return bytes written into the stream
	 * @throws IOException
	 *             Error writing to the stream
	 */
	public long write(ProgressMonitor monitor, OutputStream outputStream,
			Map<String, PackIndex> inputs) throws IOException {
		PackIndexMerger data = new PackIndexMerger(inputs);

		// List of chunks in the order they need to be written
		List<ChunkHeader> chunkHeaders = createChunkHeaders(data);
		long expectedSize = calculateExpectedSize(chunkHeaders);
		try (CancellableDigestOutputStream out = new CancellableDigestOutputStream(
				monitor, outputStream)) {
			writeHeader(out, chunkHeaders.size(), data.getPackCount());
			writeChunkLookup(out, chunkHeaders);

			// The context is shared by all chunk writers: the object offsets
			// chunk fills the large offsets buffer that the large offsets
			// chunk writes out afterwards.
			WriteContext ctx = new WriteContext(out, data);
			for (ChunkHeader chunk : chunkHeaders) {
				chunk.writerFn.write(ctx);
			}
			writeCheckSum(out);
			if (expectedSize != out.length()) {
				throw new IllegalStateException(String.format(
						JGitText.get().multiPackIndexUnexpectedSize,
						Long.valueOf(expectedSize),
						Long.valueOf(out.length())));
			}
			return expectedSize;
		} catch (InterruptedIOException e) {
			throw new IOException(JGitText.get().multiPackIndexWritingCancelled,
					e);
		}
	}

	/**
	 * Calculate the total size of the file from its chunk headers.
	 *
	 * @param chunks
	 *            chunks that will be written
	 * @return header + chunk lookup table (one entry per chunk plus the
	 *         terminating entry) + chunk contents + trailing checksum
	 */
	private static long calculateExpectedSize(List<ChunkHeader> chunks) {
		long chunkLookup = (chunks.size() + 1L) * CHUNK_LOOKUP_WIDTH;
		long chunkContent = chunks.stream().mapToLong(ChunkHeader::size).sum();
		return MIDX_HEADER_SIZE + chunkLookup + chunkContent + CHECKSUM_SIZE;
	}

	/**
	 * Build the list of chunks (id, size and writer) in the order they are
	 * written to the file.
	 *
	 * @param data
	 *            merged view of the pack indexes
	 * @return chunk headers in write order
	 */
	private List<ChunkHeader> createChunkHeaders(PackIndexMerger data) {
		List<ChunkHeader> chunkHeaders = new ArrayList<>();
		chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OIDFANOUT,
				MULTIPACK_INDEX_FANOUT_SIZE, this::writeFanoutTable));
		chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OIDLOOKUP,
				(long) data.getUniqueObjectCount() * OBJECT_ID_LENGTH,
				this::writeOidLookUp));
		chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_OBJECTOFFSETS,
				8L * data.getUniqueObjectCount(), this::writeObjectOffsets));
		if (data.needsLargeOffsetsChunk()) {
			chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_LARGEOFFSETS,
					8L * data.getOffsetsOver31BitsCount(),
					this::writeObjectLargeOffsets));
		}
		chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_REVINDEX,
				4L * data.getUniqueObjectCount(), this::writeRidx));

		// Size must be counted in encoded bytes (not chars) to agree with
		// what writePackfileNames emits: UTF-8 bytes plus a trailing null.
		long packNamesSize = data.getPackNames().stream().mapToLong(
				name -> name.getBytes(StandardCharsets.UTF_8).length + 1L)
				.sum();
		chunkHeaders.add(new ChunkHeader(MIDX_CHUNKID_PACKNAMES, packNamesSize,
				this::writePackfileNames));
		return chunkHeaders;
	}

	/**
	 * Write the first 12 bytes of the multipack index.
	 * <p>
	 * These bytes include things like magic number, version, number of
	 * chunks...
	 *
	 * @param out
	 *            output stream to write
	 * @param numChunks
	 *            number of chunks this multipack index is going to have
	 * @param packCount
	 *            number of packs covered by this multipack index
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeHeader(CancellableDigestOutputStream out, int numChunks,
			int packCount) throws IOException {
		byte[] headerBuffer = new byte[MIDX_HEADER_SIZE];
		NB.encodeInt32(headerBuffer, 0, MIDX_SIGNATURE);
		// version, hash version, chunk count and a zero byte
		byte[] buff = { MIDX_VERSION, OID_HASH_VERSION, (byte) numChunks,
				(byte) 0 };
		System.arraycopy(buff, 0, headerBuffer, 4, 4);
		NB.encodeInt32(headerBuffer, 8, packCount);
		out.write(headerBuffer, 0, headerBuffer.length);
		out.flush();
	}

	/**
	 * Write a table of "chunkId, start-offset", with a special value "0,
	 * end-of-previous_chunk", to mark the end.
	 *
	 * @param out
	 *            output stream to write
	 * @param chunkHeaders
	 *            list of chunks in the order they are expected to be written
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeChunkLookup(CancellableDigestOutputStream out,
			List<ChunkHeader> chunkHeaders) throws IOException {

		// first chunk will start at header + this lookup block
		long chunkStart = MIDX_HEADER_SIZE
				+ (long) (chunkHeaders.size() + 1) * CHUNK_LOOKUP_WIDTH;
		byte[] chunkEntry = new byte[CHUNK_LOOKUP_WIDTH];
		for (ChunkHeader chunkHeader : chunkHeaders) {
			NB.encodeInt32(chunkEntry, 0, chunkHeader.chunkId);
			NB.encodeInt64(chunkEntry, 4, chunkStart);
			out.write(chunkEntry);
			chunkStart += chunkHeader.size;
		}
		// Terminating label for the block
		// (chunkid 0, offset where the next block would start)
		NB.encodeInt32(chunkEntry, 0, 0);
		NB.encodeInt64(chunkEntry, 4, chunkStart);
		out.write(chunkEntry);
	}

	/**
	 * Write the fanout table for the object ids
	 * <p>
	 * Table with 256 entries of 4 bytes each (one entry per possible value
	 * of the first byte of an OID), where the ith entry, F[i], stores the
	 * number of OIDs with first byte at most i. Thus, F[255] stores the total
	 * number of objects.
	 *
	 * @param ctx
	 *            write context
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeFanoutTable(WriteContext ctx) throws IOException {
		byte[] tmp = new byte[4];
		int[] fanout = new int[256];
		// Count objects per first byte...
		Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
		while (iterator.hasNext()) {
			MidxMutableEntry e = iterator.next();
			fanout[e.getObjectId().getFirstByte() & 0xff]++;
		}
		// ...and turn the counts into cumulative totals.
		for (int i = 1; i < fanout.length; i++) {
			fanout[i] += fanout[i - 1];
		}
		for (int n : fanout) {
			NB.encodeInt32(tmp, 0, n);
			ctx.out.write(tmp, 0, 4);
		}
	}

	/**
	 * Write the OID lookup chunk
	 * <p>
	 * A list of OIDs in sha1 order.
	 *
	 * @param ctx
	 *            write context
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeOidLookUp(WriteContext ctx) throws IOException {
		byte[] tmp = new byte[OBJECT_ID_LENGTH];

		Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
		while (iterator.hasNext()) {
			MidxMutableEntry e = iterator.next();
			e.getObjectId().copyRawTo(tmp, 0);
			ctx.out.write(tmp, 0, OBJECT_ID_LENGTH);
		}
	}

	/**
	 * Write the object offsets chunk
	 * <p>
	 * A list of (pack id, offset) pairs, parallel to the list of OIDs. If the
	 * large offsets chunk is in use and the offset is too large (see
	 * {@link #fitsIn31bits(long)}), this contains the position in the large
	 * offsets list (marked with a 1 in the most significant bit).
	 *
	 * @param ctx
	 *            write context
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeObjectOffsets(WriteContext ctx) throws IOException {
		byte[] entry = new byte[8];
		Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
		while (iterator.hasNext()) {
			MidxMutableEntry e = iterator.next();
			NB.encodeInt32(entry, 0, e.getPackId());
			if (!ctx.data.needsLargeOffsetsChunk()
					|| fitsIn31bits(e.getOffset())) {
				// Without a large offsets chunk the low 32 bits of the
				// offset are stored as they are.
				NB.encodeInt32(entry, 4, (int) e.getOffset());
			} else {
				int offloadedPosition = ctx.largeOffsets.append(e.getOffset());
				NB.encodeInt32(entry, 4, offloadedPosition | (1 << 31));
			}
			ctx.out.write(entry);
		}
	}

	/**
	 * Writes the reverse index chunk
	 * <p>
	 * This stores the position of the objects in the main index, ordered first
	 * by pack and then by offset
	 *
	 * @param ctx
	 *            write context
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeRidx(WriteContext ctx) throws IOException {
		Map<Integer, List<OffsetPosition>> packOffsets = new HashMap<>(
				ctx.data.getPackCount());
		// TODO(ifrade): Brute force solution loading all offsets/packs in
		// memory. We could also iterate reverse indexes looking up
		// their position in the midx (and discarding if the pack doesn't
		// match).
		Iterator<MidxMutableEntry> iterator = ctx.data.bySha1Iterator();
		int midxPosition = 0;
		while (iterator.hasNext()) {
			MidxMutableEntry e = iterator.next();
			OffsetPosition op = new OffsetPosition(e.getOffset(), midxPosition);
			midxPosition++;
			packOffsets.computeIfAbsent(Integer.valueOf(e.getPackId()),
					k -> new ArrayList<>()).add(op);
		}

		for (int i = 0; i < ctx.data.getPackCount(); i++) {
			List<OffsetPosition> offsetsForPack = packOffsets
					.get(Integer.valueOf(i));
			if (offsetsForPack == null) {
				// No object was selected from this pack
				continue;
			}
			// comparingLong avoids boxing every offset while sorting
			offsetsForPack
					.sort(Comparator.comparingLong(OffsetPosition::offset));
			byte[] ridxForPack = new byte[4 * offsetsForPack.size()];
			for (int j = 0; j < offsetsForPack.size(); j++) {
				NB.encodeInt32(ridxForPack, j * 4,
						offsetsForPack.get(j).position);
			}
			ctx.out.write(ridxForPack);
		}
	}

	/**
	 * Write the large offset chunk
	 * <p>
	 * A list of large offsets (long). The regular offset chunk will point to a
	 * position here. The list is filled while writing the object offsets
	 * chunk, so that chunk must have been written before this one.
	 *
	 * @param ctx
	 *            writer context
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeObjectLargeOffsets(WriteContext ctx) throws IOException {
		ctx.out.write(ctx.largeOffsets.offsets, 0,
				ctx.largeOffsets.bytePosition);
	}

	/**
	 * Write the list of packfiles chunk
	 * <p>
	 * List of packfile names, in the order returned by the merged data, each
	 * one encoded in UTF-8 and followed by a \0
	 *
	 * @param ctx
	 *            writer context
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writePackfileNames(WriteContext ctx) throws IOException {
		for (String packName : ctx.data.getPackNames()) {
			// Spec doesn't talk about encoding.
			ctx.out.write(packName.getBytes(StandardCharsets.UTF_8));
			ctx.out.write(0);
		}
	}

	/**
	 * Write final checksum of the data written to the stream
	 *
	 * @param out
	 *            output stream used to write
	 * @throws IOException
	 *             error writing to the output stream
	 */
	private void writeCheckSum(CancellableDigestOutputStream out)
			throws IOException {
		out.write(out.getDigest());
		out.flush();
	}

	/** Offset of an object in its pack and its position in the midx. */
	private record OffsetPosition(long offset, int position) {
	}

	/**
	 * If there is at least one offset value larger than 2^32-1, then the large
	 * offset chunk must exist, and offsets larger than 2^31-1 must be stored in
	 * it instead
	 *
	 * @param offset
	 *            object offset
	 *
	 * @return true if the offset fits in 31 bits
	 */
	private static boolean fitsIn31bits(long offset) {
		return offset <= LIMIT_31_BITS;
	}

	/** In-memory buffer with the contents of the large offsets chunk. */
	private static class LargeOffsets {
		private final byte[] offsets;

		/** Next free byte in {@link #offsets}. */
		private int bytePosition;

		LargeOffsets(int largeOffsetsCount) {
			offsets = new byte[largeOffsetsCount * 8];
			bytePosition = 0;
		}

		/**
		 * Add an offset to the large offset chunk
		 *
		 * @param largeOffset
		 *            a large offset
		 * @return the position of the just inserted offset (as in number of
		 *         offsets, NOT in bytes)
		 */
		int append(long largeOffset) {
			int at = bytePosition;
			NB.encodeInt64(offsets, at, largeOffset);
			bytePosition += 8;
			return at / 8;
		}
	}

	/** Id, size in bytes and writer function of a chunk. */
	private record ChunkHeader(int chunkId, long size, ChunkWriter writerFn) {
	}

	/** Function that writes the contents of one chunk. */
	@FunctionalInterface
	private interface ChunkWriter {
		void write(WriteContext ctx) throws IOException;
	}

	/** State shared by the chunk writers of a single write invocation. */
	private static class WriteContext {
		final CancellableDigestOutputStream out;

		final PackIndexMerger data;

		final LargeOffsets largeOffsets;

		WriteContext(CancellableDigestOutputStream out, PackIndexMerger data) {
			this.out = out;
			this.data = data;
			this.largeOffsets = new LargeOffsets(
					data.getOffsetsOver31BitsCount());
		}
	}
}