From c9552abaf3286f5b277d64f78c46020fe3978aa3 Mon Sep 17 00:00:00 2001 From: Ivan Frade Date: Tue, 14 Dec 2021 16:36:54 -0800 Subject: [PATCH] PackObjectSizeIndex: interface and impl for the object-size index Operations like "clone --filter=blob:limit=N" or the "object-info" command need to read the size of the objects from the storage. An index would provide those sizes at once rather than having to seek in the packfile. Introduce an interface for the Object-size index. This index returns the inflated size of an object. Not all objects could be indexed (to limit memory usage). This implementation indexes only blobs (no trees, nor commits) *above* certain size threshold (configurable). Lower threshold adds more objects to the index, consumes more memory and provides better performance. 0 means "all blobs" and -1 "disabled". If we don't index everything, for the filter use case is more efficient to index the biggest objects first: the set is small and most objects are filtered by NOT being in the index. For the object-size, the more objects in the index the better, regardless their size. All together, it is more helpful to index above threshold. Change-Id: I9ed608ac240677e199b90ca40d420bcad9231489 --- .../file/PackObjectSizeIndexV1Test.java | 392 ++++++++++++++++++ .../storage/file/PackObjectSizeIndex.java | 45 ++ .../file/PackObjectSizeIndexLoader.java | 43 ++ .../storage/file/PackObjectSizeIndexV1.java | 224 ++++++++++ .../file/PackObjectSizeIndexWriter.java | 285 +++++++++++++ 5 files changed, 989 insertions(+) create mode 100644 org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1Test.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndex.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexLoader.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1.java create mode 100644 org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexWriter.java diff --git a/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1Test.java b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1Test.java new file mode 100644 index 0000000000..11e3746020 --- /dev/null +++ b/org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1Test.java @@ -0,0 +1,392 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +import org.eclipse.jgit.lib.Constants; +import org.eclipse.jgit.lib.ObjectId; +import org.eclipse.jgit.transport.PackedObjectInfo; +import org.eclipse.jgit.util.BlockList; +import org.junit.Test; + +public class PackObjectSizeIndexV1Test { + private static final ObjectId OBJ_ID = ObjectId + .fromString("b8b1d53172fb3fb19647adce4b38fab4371c2454"); + + private static final long GB = 1 << 30; + + private static final int MAX_24BITS_UINT = 0xffffff; + + @Test + public void write_24bPositions_32bSizes() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + objs.add(blobWithSize(100)); + objs.add(blobWithSize(400)); + objs.add(blobWithSize(200)); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x00, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x03, // obj count + 0x18, // Unsigned 3 bytes + 0x00, 0x00, 0x00, 0x03, // 3 positions + 0x00, 0x00, 0x00, // positions + 0x00, 0x00, 0x01, // + 0x00, 0x00, 0x02, // + 0x00, // No more positions + 0x00, 0x00, 0x00, 0x64, // size 100 + 0x00, 0x00, 0x01, (byte) 0x90, // size 400 + 0x00, 0x00, 0x00, (byte) 0xc8, // size 200 + 0x00, 0x00, 0x00, 0x00 // 64bit sizes counter + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void write_32bPositions_32bSizes() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new BlockList<>(9_000_000); + // The 24 bit range is full of commits and trees + PackedObjectInfo commit = objInfo(Constants.OBJ_COMMIT, 100); + for (int i = 0; i <= MAX_24BITS_UINT; i++) { + objs.add(commit); + } + objs.add(blobWithSize(100)); + objs.add(blobWithSize(400)); + objs.add(blobWithSize(200)); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x00, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x03, // obj count + (byte) 0x20, // Signed 4 bytes + 0x00, 0x00, 0x00, 0x03, // 3 positions + 0x01, 0x00, 0x00, 0x00, // positions + 0x01, 0x00, 0x00, 0x01, // + 0x01, 0x00, 0x00, 0x02, // + 0x00, // No more positions + 0x00, 0x00, 0x00, 0x64, // size 100 + 0x00, 0x00, 0x01, (byte) 0x90, // size 400 + 0x00, 0x00, 0x00, (byte) 0xc8, // size 200 + 0x00, 0x00, 0x00, 0x00 // 64bit sizes counter + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void write_24b32bPositions_32bSizes() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new BlockList<>(9_000_000); + // The 24 bit range is full of commits and trees + PackedObjectInfo commit = objInfo(Constants.OBJ_COMMIT, 100); + for (int i = 0; i < MAX_24BITS_UINT; i++) { + objs.add(commit); + } + objs.add(blobWithSize(100)); + objs.add(blobWithSize(400)); + objs.add(blobWithSize(200)); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x00, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x03, // obj count + 0x18, // 3 bytes + 0x00, 0x00, 0x00, 0x01, // 1 position + (byte) 0xff, (byte) 0xff, (byte) 0xff, + (byte) 0x20, // 4 bytes (32 bits) + 0x00, 0x00, 0x00, 0x02, // 2 positions + 0x01, 0x00, 0x00, 0x00, // positions + 0x01, 0x00, 0x00, 0x01, // + 0x00, // No more positions + 0x00, 0x00, 0x00, 0x64, // size 100 + 0x00, 0x00, 0x01, (byte) 0x90, // size 400 + 0x00, 0x00, 0x00, (byte) 0xc8, // size 200 + 0x00, 0x00, 0x00, 0x00 // 64bit sizes counter + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void write_64bitsSize() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + objs.add(blobWithSize(100)); + objs.add(blobWithSize(3 * GB)); + objs.add(blobWithSize(4 * GB)); + objs.add(blobWithSize(400)); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x00, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x04, // Object count + 0x18, // Unsigned 3 byte positions + 0x00, 0x00, 0x00, 0x04, // 4 positions + 0x00, 0x00, 0x00, // positions + 0x00, 0x00, 0x01, // + 0x00, 0x00, 0x02, // + 0x00, 0x00, 0x03, // + 0x00, // No more positions + 0x00, 0x00, 0x00, 0x64, // size 100 + (byte) 0xff, (byte) 0xff, (byte) 0xff, (byte) 0xff, // -1 (3GB) + (byte) 0xff, (byte) 0xff, (byte) 0xff, (byte) 0xfe, // -2 (4GB) + 0x00, 0x00, 0x01, (byte) 0x90, // size 400 + 0x00, 0x00, 0x00, (byte) 0x02, // 64bit sizes counter (2) + 0x00, 0x00, 0x00, 0x00, // size 3Gb + (byte) 0xc0, 0x00, 0x00, 0x00, // + 0x00, 0x00, 0x00, 0x01, // size 4GB + (byte) 0x00, 0x00, 0x00, 0x00, // + 0x00, 0x00, 0x00, 0x00 // 128bit sizes counter + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void write_allObjectsTooSmall() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 1 << 10); + List objs = new ArrayList<>(); + // too small blobs + objs.add(blobWithSize(100)); + objs.add(blobWithSize(200)); + objs.add(blobWithSize(400)); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x04, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x00, // Object count + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void write_onlyBlobsIndexed() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + objs.add(objInfo(Constants.OBJ_COMMIT, 1000)); + objs.add(blobWithSize(100)); + objs.add(objInfo(Constants.OBJ_TAG, 1000)); + objs.add(blobWithSize(400)); + objs.add(blobWithSize(200)); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x00, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x03, // Object count + 0x18, // Positions in 3 bytes + 0x00, 0x00, 0x00, 0x03, // 3 entries + 0x00, 0x00, 0x01, // positions + 0x00, 0x00, 0x03, // + 0x00, 0x00, 0x04, // + 0x00, // No more positions + 0x00, 0x00, 0x00, 0x64, // size 100 + 0x00, 0x00, 0x01, (byte) 0x90, // size 400 + 0x00, 0x00, 0x00, (byte) 0xc8, // size 200 + 0x00, 0x00, 0x00, 0x00 // 64bit sizes counter + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void write_noObjects() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + + writer.write(objs); + byte[] expected = new byte[] { -1, 's', 'i', 'z', // header + 0x01, // version + 0x00, 0x00, 0x00, 0x00, // minimum object size + 0x00, 0x00, 0x00, 0x00, // Object count + }; + byte[] output = out.toByteArray(); + assertArrayEquals(expected, output); + } + + @Test + public void read_empty() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + + writer.write(objs); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in); + assertEquals(-1, index.getSize(0)); + assertEquals(-1, index.getSize(1)); + assertEquals(-1, index.getSize(1 << 30)); + assertEquals(0, index.getThreshold()); + } + + @Test + public void read_only24bitsPositions() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + objs.add(blobWithSize(100)); + objs.add(blobWithSize(200)); + objs.add(blobWithSize(400)); + objs.add(blobWithSize(1500)); + + writer.write(objs); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in); + assertEquals(100, index.getSize(0)); + assertEquals(200, index.getSize(1)); + assertEquals(400, index.getSize(2)); + assertEquals(1500, index.getSize(3)); + assertEquals(0, index.getThreshold()); + } + + @Test + public void read_only32bitsPositions() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 2000); + List objs = new ArrayList<>(); + PackedObjectInfo smallObj = blobWithSize(100); + for (int i = 0; i <= MAX_24BITS_UINT; i++) { + objs.add(smallObj); + } + objs.add(blobWithSize(1000)); + objs.add(blobWithSize(3000)); + objs.add(blobWithSize(2500)); + objs.add(blobWithSize(1000)); + + writer.write(objs); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in); + assertEquals(-1, index.getSize(5)); + assertEquals(-1, index.getSize(MAX_24BITS_UINT+1)); + assertEquals(3000, index.getSize(MAX_24BITS_UINT+2)); + assertEquals(2500, index.getSize(MAX_24BITS_UINT+3)); + assertEquals(-1, index.getSize(MAX_24BITS_UINT+4)); // Not indexed + assertEquals(2000, index.getThreshold()); + } + + @Test + public void read_24and32BitsPositions() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 2000); + List objs = new ArrayList<>(); + PackedObjectInfo smallObj = blobWithSize(100); + for (int i = 0; i <= MAX_24BITS_UINT; i++) { + if (i == 500 || i == 1000 || i == 1500) { + objs.add(blobWithSize(2500)); + continue; + } + objs.add(smallObj); + } + objs.add(blobWithSize(3000)); + objs.add(blobWithSize(1000)); + objs.add(blobWithSize(2500)); + objs.add(blobWithSize(1000)); + + writer.write(objs); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in); + // 24 bit positions + assertEquals(-1, index.getSize(5)); + assertEquals(2500, index.getSize(500)); + // 32 bit positions + assertEquals(3000, index.getSize(MAX_24BITS_UINT+1)); + assertEquals(-1, index.getSize(MAX_24BITS_UINT+2)); + assertEquals(2500, index.getSize(MAX_24BITS_UINT+3)); + assertEquals(-1, index.getSize(MAX_24BITS_UINT+4)); // Not indexed + assertEquals(2000, index.getThreshold()); + } + + @Test + public void read_only64bits() throws IOException { + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, 0); + List objs = new ArrayList<>(); + objs.add(blobWithSize(3 * GB)); + objs.add(blobWithSize(8 * GB)); + + writer.write(objs); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in); + assertEquals(3 * GB, index.getSize(0)); + assertEquals(8 * GB, index.getSize(1)); + assertEquals(0, index.getThreshold()); + } + + @Test + public void read_withMinSize() throws IOException { + int minSize = 1000; + ByteArrayOutputStream out = new ByteArrayOutputStream(); + PackObjectSizeIndexWriter writer = PackObjectSizeIndexWriter + .createWriter(out, minSize); + List objs = new ArrayList<>(); + objs.add(blobWithSize(3 * GB)); + objs.add(blobWithSize(1500)); + objs.add(blobWithSize(500)); + objs.add(blobWithSize(1000)); + objs.add(blobWithSize(2000)); + + writer.write(objs); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + PackObjectSizeIndex index = PackObjectSizeIndexLoader.load(in); + assertEquals(3 * GB, index.getSize(0)); + assertEquals(1500, index.getSize(1)); + assertEquals(-1, index.getSize(2)); + assertEquals(1000, index.getSize(3)); + assertEquals(2000, index.getSize(4)); + assertEquals(minSize, index.getThreshold()); + } + + private static PackedObjectInfo blobWithSize(long size) { + return objInfo(Constants.OBJ_BLOB, size); + } + + private static PackedObjectInfo objInfo(int type, long size) { + PackedObjectInfo objectInfo = new PackedObjectInfo(OBJ_ID); + objectInfo.setType(type); + objectInfo.setFullSize(size); + return objectInfo; + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndex.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndex.java new file mode 100644 index 0000000000..1c3797c509 --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndex.java @@ -0,0 +1,45 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +/** + * Index of object sizes in a pack + * + * It is not guaranteed that the implementation contains the sizes of all + * objects (e.g. it could store only objects over certain threshold). + */ +public interface PackObjectSizeIndex { + + /** + * Returns the inflated size of the object. + * + * @param idxOffset + * position in the pack (as returned from PackIndex) + * @return size of the object, -1 if not found in the index. + */ + long getSize(int idxOffset); + + /** + * Number of objects in the index + * + * @return number of objects in the index + */ + long getObjectCount(); + + + /** + * Minimal size of an object to be included in this index + * + * Cut-off value used at generation time to decide what objects to index. + * + * @return size in bytes + */ + int getThreshold(); +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexLoader.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexLoader.java new file mode 100644 index 0000000000..9d6941823a --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexLoader.java @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +/** + * Chooses the specific implementation of the object-size index based on the + * file version. + */ +public class PackObjectSizeIndexLoader { + + /** + * Read an object size index from the stream + * + * @param in + * input stream at the beginning of the object size data + * @return an implementation of the object size index + * @throws IOException + * error reading the streams + */ + public static PackObjectSizeIndex load(InputStream in) throws IOException { + byte[] header = in.readNBytes(4); + if (!Arrays.equals(header, PackObjectSizeIndexWriter.HEADER)) { + throw new IOException("Stream is not an object index"); //$NON-NLS-1$ + } + + int version = in.readNBytes(1)[0]; + if (version != 1) { + throw new IOException("Unknown object size version: " + version); //$NON-NLS-1$ + } + return PackObjectSizeIndexV1.parse(in); + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1.java new file mode 100644 index 0000000000..68befc6ca5 --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexV1.java @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.util.Arrays; + +import org.eclipse.jgit.util.NB; + +/** + * Memory representation of the object-size index + * + * The object size index is a map from position in the primary idx (i.e. + * position of the object-id in lexicographical order) to size. + * + * Most of the positions fit in unsigned 3 bytes (up to 16 million) + */ +class PackObjectSizeIndexV1 implements PackObjectSizeIndex { + + private static final byte BITS_24 = 0x18; + + private static final byte BITS_32 = 0x20; + + private final int threshold; + + private final UInt24Array positions24; + + private final int[] positions32; + + /** + * Parallel array to concat(positions24, positions32) with the size of the + * objects. + * + * A value >= 0 is the size of the object. A negative value means the size + * doesn't fit in an int and |value|-1 is the position for the size in the + * size64 array e.g. a value of -1 is sizes64[0], -2 = sizes64[1], ... + */ + private final int[] sizes32; + + private final long[] sizes64; + + static PackObjectSizeIndex parse(InputStream in) throws IOException { + /** Header and version already out of the input */ + IndexInputStreamReader stream = new IndexInputStreamReader(in); + int threshold = stream.readInt(); // minSize + int objCount = stream.readInt(); + if (objCount == 0) { + return new EmptyPackObjectSizeIndex(threshold); + } + return new PackObjectSizeIndexV1(stream, threshold, objCount); + } + + private PackObjectSizeIndexV1(IndexInputStreamReader stream, int threshold, + int objCount) throws IOException { + this.threshold = threshold; + UInt24Array pos24 = null; + int[] pos32 = null; + + byte positionEncoding; + while ((positionEncoding = stream.readByte()) != 0) { + if (Byte.compareUnsigned(positionEncoding, BITS_24) == 0) { + int sz = stream.readInt(); + pos24 = new UInt24Array(stream.readNBytes(sz * 3)); + } else if (Byte.compareUnsigned(positionEncoding, BITS_32) == 0) { + int sz = stream.readInt(); + pos32 = stream.readIntArray(sz); + } else { + throw new UnsupportedEncodingException( + String.format("Unknown position encoding %s", + Integer.toHexString(positionEncoding))); + } + } + positions24 = pos24 != null ? pos24 : UInt24Array.EMPTY; + positions32 = pos32 != null ? pos32 : new int[0]; + + sizes32 = stream.readIntArray(objCount); + int c64sizes = stream.readInt(); + if (c64sizes == 0) { + sizes64 = new long[0]; + return; + } + sizes64 = stream.readLongArray(c64sizes); + int c128sizes = stream.readInt(); + if (c128sizes != 0) { + // this MUST be 0 (we don't support 128 bits sizes yet) + throw new IOException("Unsupported sizes in object-size-index"); + } + } + + @Override + public long getSize(int idxOffset) { + int pos = -1; + if (!positions24.isEmpty() && idxOffset <= positions24.getLastValue()) { + pos = positions24.binarySearch(idxOffset); + } else if (positions32.length > 0 && idxOffset >= positions32[0]) { + int pos32 = Arrays.binarySearch(positions32, idxOffset); + if (pos32 >= 0) { + pos = pos32 + positions24.size(); + } + } + if (pos < 0) { + return -1; + } + + int objSize = sizes32[pos]; + if (objSize < 0) { + int secondPos = Math.abs(objSize) - 1; + return sizes64[secondPos]; + } + return objSize; + } + + @Override + public long getObjectCount() { + return positions24.size() + positions32.length; + } + + @Override + public int getThreshold() { + return threshold; + } + + /** + * Wrapper to read parsed content from the byte stream + */ + private static class IndexInputStreamReader { + + private final byte[] buffer = new byte[8]; + + private final InputStream in; + + IndexInputStreamReader(InputStream in) { + this.in = in; + } + + int readInt() throws IOException { + int n = in.readNBytes(buffer, 0, 4); + if (n < 4) { + throw new IOException( + "Unable to read a full int from the stream"); + } + return NB.decodeInt32(buffer, 0); + } + + int[] readIntArray(int intsCount) throws IOException { + if (intsCount == 0) { + return new int[0]; + } + + int[] dest = new int[intsCount]; + for (int i = 0; i < intsCount; i++) { + dest[i] = readInt(); + } + return dest; + } + + long readLong() throws IOException { + int n = in.readNBytes(buffer, 0, 8); + if (n < 8) { + throw new IOException( + "Unable to read a full int from the stream"); + } + return NB.decodeInt64(buffer, 0); + } + + long[] readLongArray(int longsCount) throws IOException { + if (longsCount == 0) { + return new long[0]; + } + + long[] dest = new long[longsCount]; + for (int i = 0; i < longsCount; i++) { + dest[i] = readLong(); + } + return dest; + } + + byte readByte() throws IOException { + int n = in.readNBytes(buffer, 0, 1); + if (n != 1) { + throw new IOException("Cannot read byte from stream"); + } + return buffer[0]; + } + + byte[] readNBytes(int sz) throws IOException { + return in.readNBytes(sz); + } + } + + private static class EmptyPackObjectSizeIndex + implements PackObjectSizeIndex { + + private final int threshold; + + EmptyPackObjectSizeIndex(int threshold) { + this.threshold = threshold; + } + + @Override + public long getSize(int idxOffset) { + return -1; + } + + @Override + public long getObjectCount() { + return 0; + } + + @Override + public int getThreshold() { + return threshold; + } + } +} diff --git a/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexWriter.java b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexWriter.java new file mode 100644 index 0000000000..ae03797cad --- /dev/null +++ b/org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/file/PackObjectSizeIndexWriter.java @@ -0,0 +1,285 @@ +/* + * Copyright (C) 2022, Google LLC and others + * + * This program and the accompanying materials are made available under the + * terms of the Eclipse Distribution License v. 1.0 which is available at + * https://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + */ +package org.eclipse.jgit.internal.storage.file; + +import java.io.BufferedOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.util.List; + +import org.eclipse.jgit.lib.Constants; +import org.eclipse.jgit.transport.PackedObjectInfo; +import org.eclipse.jgit.util.NB; + +/** + * Write an object index in the output stream + */ +public abstract class PackObjectSizeIndexWriter { + + private static final int MAX_24BITS_UINT = 0xffffff; + + private static final PackObjectSizeIndexWriter NULL_WRITER = new PackObjectSizeIndexWriter() { + @Override + public void write(List objs) { + // Do nothing + } + }; + + /** Magic constant for the object size index file */ + protected static final byte[] HEADER = { -1, 's', 'i', 'z' }; + + /** + * Returns a writer for the latest index version + * + * @param os + * Output stream where to write the index + * @param minSize + * objects strictly smaller than this size won't be added to the + * index. Negative size won't write AT ALL. Other sizes could write + * an empty index. + * @return the index writer + */ + public static PackObjectSizeIndexWriter createWriter(OutputStream os, + int minSize) { + if (minSize < 0) { + return NULL_WRITER; + } + return new PackObjectSizeWriterV1(os, minSize); + } + + /** + * Add the objects to the index + * + * @param objs + * objects in the pack, in sha1 order. Their position in the list + * matches their position in the primary index. + * @throws IOException + * problem writing to the stream + */ + public abstract void write(List objs) + throws IOException; + + /** + * Object size index v1. + * + * Store position (in the main index) to size as parallel arrays. + * + *

Positions in the main index fit well in unsigned 24 bits (16M) for most + * repositories, but some outliers have even more objects, so we need to + * store also 32 bits positions. + * + *

Sizes are stored as a first array parallel to positions. If a size + * doesn't fit in an element of that array, then we encode there a position + * on the next-size array. This "overflow" array doesn't have entries for + * all positions. + * + *

+	 *
+	 *      positions       [10, 500, 1000, 1001]
+	 *      sizes (32bits)  [15MB, -1, 6MB, -2]
+   *                          ___/  ______/
+	 *                        /     /
+	 *      sizes (64 bits) [3GB, 6GB]
+	 * 
+ * + *

For sizes we use 32 bits as the first level and 64 for the rare objects + * over 2GB. + * + *

A 24/32/64 bits hierarchy of arrays saves space if we have a lot of small + * objects, but wastes space if we have only big ones. The min size to index is + * controlled by conf and in principle we want to index only rather + * big objects (e.g. > 10MB). We could support more dynamics read/write of sizes + * (e.g. 24 only if the threshold will include many of those objects) but it + * complicates a lot code and spec. If needed it could go for a v2 of the protocol. + * + *

Format: + * + *

  • A header with the magic number (4 bytes) + *
  • The index version (1 byte) + *
  • The minimum object size (4 bytes) + *
  • Total count of objects indexed (C, 4 bytes) + * (if count == 0, stop here) + * + * Blocks of + *
  • Size per entry in bits (1 byte, either 24 (0x18) or 32 (0x20)) + *
  • Count of entries (4 bytes) (c, as a signed int) + *
  • positions encoded in s bytes each (i.e s*c bytes) + * + *
  • 0 (as a "size-per-entry = 0", marking end of the section) + * + *
  • 32 bit sizes (C * 4 bytes). Negative size means + * nextLevel[abs(size)-1] + *
  • Count of 64 bit sizes (s64) (or 0 if no more indirections) + *
  • 64 bit sizes (s64 * 8 bytes) + *
  • 0 (end) + */ + static class PackObjectSizeWriterV1 extends PackObjectSizeIndexWriter { + + private final OutputStream os; + + private final int minObjSize; + + private final byte[] intBuffer = new byte[4]; + + PackObjectSizeWriterV1(OutputStream os, int minSize) { + this.os = new BufferedOutputStream(os); + this.minObjSize = minSize; + } + + @Override + public void write(List allObjects) + throws IOException { + os.write(HEADER); + writeUInt8(1); // Version + writeInt32(minObjSize); + + PackedObjectStats stats = countIndexableObjects(allObjects); + int[] indexablePositions = findIndexablePositions(allObjects, + stats.indexableObjs); + writeInt32(indexablePositions.length); // Total # of objects + if (indexablePositions.length == 0) { + os.flush(); + return; + } + + // Positions that fit in 3 bytes + if (stats.pos24Bits > 0) { + writeUInt8(24); + writeInt32(stats.pos24Bits); + applyToRange(indexablePositions, 0, stats.pos24Bits, + this::writeInt24); + } + // Positions that fit in 4 bytes + // We only use 31 bits due to sign, + // but that covers 2 billion objs + if (stats.pos31Bits > 0) { + writeUInt8(32); + writeInt32(stats.pos31Bits); + applyToRange(indexablePositions, stats.pos24Bits, + stats.pos24Bits + stats.pos31Bits, this::writeInt32); + } + writeUInt8(0); + writeSizes(allObjects, indexablePositions, stats.sizeOver2GB); + os.flush(); + } + + private void writeUInt8(int i) throws IOException { + if (i > 255) { + throw new IllegalStateException( + "Number doesn't fit in a single byte"); + } + NB.encodeInt32(intBuffer, 0, i); + os.write(intBuffer, 3, 1); + } + + private void writeInt24(int i) throws IOException { + NB.encodeInt24(intBuffer, 1, i); + os.write(intBuffer, 1, 3); + } + + private void writeInt32(int i) throws IOException { + NB.encodeInt32(intBuffer, 0, i); + os.write(intBuffer); + } + + private void writeSizes(List allObjects, + int[] indexablePositions, int objsBiggerThan2Gb) + throws IOException { + if (indexablePositions.length == 0) { + writeInt32(0); + return; + } + + byte[] sizes64bits = new byte[8 * objsBiggerThan2Gb]; + int s64 = 0; + for (int i = 0; i < indexablePositions.length; i++) { + PackedObjectInfo info = allObjects.get(indexablePositions[i]); + if (info.getFullSize() < Integer.MAX_VALUE) { + writeInt32((int) info.getFullSize()); + } else { + // Size needs more than 32 bits. Store -1 * offset in the + // next table as size. + writeInt32(-1 * (s64 + 1)); + NB.encodeInt64(sizes64bits, s64 * 8, info.getFullSize()); + s64++; + } + } + if (objsBiggerThan2Gb > 0) { + writeInt32(objsBiggerThan2Gb); + os.write(sizes64bits); + } + writeInt32(0); + } + + private int[] findIndexablePositions( + List allObjects, + int indexableObjs) { + int[] positions = new int[indexableObjs]; + int positionIdx = 0; + for (int i = 0; i < allObjects.size(); i++) { + PackedObjectInfo o = allObjects.get(i); + if (!shouldIndex(o)) { + continue; + } + positions[positionIdx++] = i; + } + return positions; + } + + private PackedObjectStats countIndexableObjects( + List objs) { + PackedObjectStats stats = new PackedObjectStats(); + for (int i = 0; i < objs.size(); i++) { + PackedObjectInfo o = objs.get(i); + if (!shouldIndex(o)) { + continue; + } + stats.indexableObjs++; + if (o.getFullSize() > Integer.MAX_VALUE) { + stats.sizeOver2GB++; + } + if (i <= MAX_24BITS_UINT) { + stats.pos24Bits++; + } else { + stats.pos31Bits++; + // i is a positive int, cannot be bigger than this + } + } + return stats; + } + + private boolean shouldIndex(PackedObjectInfo o) { + return (o.getType() == Constants.OBJ_BLOB) + && (o.getFullSize() >= minObjSize); + } + + private static class PackedObjectStats { + int indexableObjs; + + int pos24Bits; + + int pos31Bits; + + int sizeOver2GB; + } + + @FunctionalInterface + interface IntEncoder { + void encode(int i) throws IOException; + } + + private static void applyToRange(int[] allPositions, int start, int end, + IntEncoder encoder) throws IOException { + for (int i = start; i < end; i++) { + encoder.encode(allPositions[i]); + } + } + } +} -- 2.39.5