Browse Source

Prune UNREACHABLE_GARBAGE packs when they expire

DfsGarbageCollector will now enforce a maximum time to live (TTL) for
UNREACHABLE_GARBAGE packs.  The default TTL is 1 day, which should be
enough time to avoid races with other processes that are inserting
data into the repository.

Change-Id: Id719e6e2a03cfc9a0c0aef8ed71d261dda14bd0c
Signed-off-by: Mike Williams <miwilliams@google.com>
tags/v4.5.0.201609210915-r
Mike Williams 8 years ago
parent
commit
fd527a2cd7

+ 239
- 0
org.eclipse.jgit.test/tst/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollectorTest.java View File

@@ -0,0 +1,239 @@
package org.eclipse.jgit.internal.storage.dfs;

import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.GC;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.INSERT;
import static org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource.UNREACHABLE_GARBAGE;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

import java.io.IOException;
import java.util.concurrent.TimeUnit;

import org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource;
import org.eclipse.jgit.junit.TestRepository;
import org.eclipse.jgit.lib.AnyObjectId;
import org.eclipse.jgit.lib.Ref;
import org.eclipse.jgit.lib.Repository;
import org.eclipse.jgit.revwalk.RevCommit;
import org.eclipse.jgit.revwalk.RevWalk;
import org.junit.Before;
import org.junit.Test;

public class DfsGarbageCollectorTest {
private TestRepository<InMemoryRepository> git;
private InMemoryRepository repo;
private DfsObjDatabase odb;

@Before
public void setUp() throws IOException {
DfsRepositoryDescription desc = new DfsRepositoryDescription("test");
git = new TestRepository<>(new InMemoryRepository(desc));
repo = git.getRepository();
odb = repo.getObjectDatabase();
}

@Test
public void testCollectionWithNoGarbage() throws Exception {
RevCommit commit0 = commit().message("0").create();
RevCommit commit1 = commit().message("1").parent(commit0).create();
git.update("master", commit1);

assertTrue("commit0 reachable", isReachable(repo, commit0));
assertTrue("commit1 reachable", isReachable(repo, commit1));

// Packs start out as INSERT.
assertEquals(2, odb.getPacks().length);
for (DfsPackFile pack : odb.getPacks()) {
assertEquals(INSERT, pack.getPackDescription().getPackSource());
}

gcNoTtl();

// Single GC pack present with all objects.
assertEquals(1, odb.getPacks().length);
DfsPackFile pack = odb.getPacks()[0];
assertEquals(GC, pack.getPackDescription().getPackSource());
assertTrue("commit0 in pack", isObjectInPack(commit0, pack));
assertTrue("commit1 in pack", isObjectInPack(commit1, pack));
}

@Test
public void testCollectionWithGarbage() throws Exception {
RevCommit commit0 = commit().message("0").create();
RevCommit commit1 = commit().message("1").parent(commit0).create();
git.update("master", commit0);

assertTrue("commit0 reachable", isReachable(repo, commit0));
assertFalse("commit1 garbage", isReachable(repo, commit1));
gcNoTtl();

assertEquals(2, odb.getPacks().length);
DfsPackFile gc = null;
DfsPackFile garbage = null;
for (DfsPackFile pack : odb.getPacks()) {
DfsPackDescription d = pack.getPackDescription();
if (d.getPackSource() == GC) {
gc = pack;
} else if (d.getPackSource() == UNREACHABLE_GARBAGE) {
garbage = pack;
} else {
fail("unexpected " + d.getPackSource());
}
}

assertNotNull("created GC pack", gc);
assertTrue(isObjectInPack(commit0, gc));

assertNotNull("created UNREACHABLE_GARBAGE pack", garbage);
assertTrue(isObjectInPack(commit1, garbage));
}

@Test
public void testCollectionWithGarbageAndGarbagePacksPurged()
throws Exception {
RevCommit commit0 = commit().message("0").create();
RevCommit commit1 = commit().message("1").parent(commit0).create();
git.update("master", commit0);

gcNoTtl();
gcWithTtl();

// The repository has an UNREACHABLE_GARBAGE pack that could have
// expired, but since we never purge the most recent UNREACHABLE_GARBAGE
// pack, it must have survived the GC.
boolean commit1Found = false;
for (DfsPackFile pack : odb.getPacks()) {
DfsPackDescription d = pack.getPackDescription();
if (d.getPackSource() == GC) {
assertTrue("has commit0", isObjectInPack(commit0, pack));
assertFalse("no commit1", isObjectInPack(commit1, pack));
} else if (d.getPackSource() == UNREACHABLE_GARBAGE) {
commit1Found |= isObjectInPack(commit1, pack);
} else {
fail("unexpected " + d.getPackSource());
}
}
assertTrue("garbage commit1 still readable", commit1Found);

// Find oldest UNREACHABLE_GARBAGE; it will be pruned by next GC.
DfsPackDescription oldestGarbagePack = null;
for (DfsPackFile pack : odb.getPacks()) {
DfsPackDescription d = pack.getPackDescription();
if (d.getPackSource() == UNREACHABLE_GARBAGE) {
oldestGarbagePack = oldestPack(oldestGarbagePack, d);
}
}
assertNotNull("has UNREACHABLE_GARBAGE", oldestGarbagePack);

gcWithTtl();
assertTrue("has packs", odb.getPacks().length > 0);
for (DfsPackFile pack : odb.getPacks()) {
assertNotEquals(oldestGarbagePack, pack.getPackDescription());
}
}

@Test
public void testCollectionWithGarbageCoalescence() throws Exception {
RevCommit commit0 = commit().message("0").create();
RevCommit commit1 = commit().message("1").parent(commit0).create();
git.update("master", commit0);

for (int i = 0; i < 3; i++) {
commit1 = commit().message("g" + i).parent(commit1).create();

// Make sure we don't have more than 1 UNREACHABLE_GARBAGE pack
// because they're coalesced.
gcNoTtl();
assertEquals(1, countPacks(UNREACHABLE_GARBAGE));
}
}

@Test
public void testCollectionWithGarbageNoCoalescence() throws Exception {
RevCommit commit0 = commit().message("0").create();
RevCommit commit1 = commit().message("1").parent(commit0).create();
git.update("master", commit0);

for (int i = 0; i < 3; i++) {
commit1 = commit().message("g" + i).parent(commit1).create();

DfsGarbageCollector gc = new DfsGarbageCollector(repo);
gc.setCoalesceGarbageLimit(0);
gc.setGarbageTtl(0, TimeUnit.MILLISECONDS);
run(gc);
assertEquals(1 + i, countPacks(UNREACHABLE_GARBAGE));
}
}

private TestRepository<InMemoryRepository>.CommitBuilder commit() {
return git.commit();
}

private void gcNoTtl() throws IOException {
DfsGarbageCollector gc = new DfsGarbageCollector(repo);
gc.setGarbageTtl(0, TimeUnit.MILLISECONDS); // disable TTL
run(gc);
}

private void gcWithTtl() throws InterruptedException, IOException {
// Wait for the system clock to move by at least 1 millisecond.
// This allows the DfsGarbageCollector to recognize the boundary.
long start = System.currentTimeMillis();
do {
Thread.sleep(10);
} while (System.currentTimeMillis() <= start);

DfsGarbageCollector gc = new DfsGarbageCollector(repo);
gc.setGarbageTtl(1, TimeUnit.MILLISECONDS);
run(gc);
}

private void run(DfsGarbageCollector gc) throws IOException {
assertTrue("gc repacked", gc.pack(null));
odb.clearCache();
}

private static boolean isReachable(Repository repo, AnyObjectId id)
throws IOException {
try (RevWalk rw = new RevWalk(repo)) {
for (Ref ref : repo.getAllRefs().values()) {
rw.markStart(rw.parseCommit(ref.getObjectId()));
}
for (RevCommit next; (next = rw.next()) != null;) {
if (AnyObjectId.equals(next, id)) {
return true;
}
}
}
return false;
}

private boolean isObjectInPack(AnyObjectId id, DfsPackFile pack)
throws IOException {
try (DfsReader reader = new DfsReader(odb)) {
return pack.hasObject(reader, id);
}
}

private static DfsPackDescription oldestPack(DfsPackDescription a,
DfsPackDescription b) {
if (a != null && a.getLastModified() < b.getLastModified()) {
return a;
}
return b;
}

private int countPacks(PackSource source) throws IOException {
int cnt = 0;
for (DfsPackFile pack : odb.getPacks()) {
if (pack.getPackDescription().getPackSource() == source) {
cnt++;
}
}
return cnt;
}
}

+ 106
- 11
org.eclipse.jgit/src/org/eclipse/jgit/internal/storage/dfs/DfsGarbageCollector.java View File

@@ -54,9 +54,11 @@ import static org.eclipse.jgit.internal.storage.pack.PackExt.PACK;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.TimeUnit;

import org.eclipse.jgit.internal.JGitText;
import org.eclipse.jgit.internal.storage.dfs.DfsObjDatabase.PackSource;
@@ -93,10 +95,14 @@ public class DfsGarbageCollector {

private PackConfig packConfig;

// See pack(), below, for how these two variables interact.
private long coalesceGarbageLimit = 50 << 20;
private long garbageTtlMillis = TimeUnit.DAYS.toMillis(1);

private long startTimeMillis;
private List<DfsPackFile> packsBefore;
private List<DfsPackFile> expiredGarbagePacks;

private Set<ObjectId> allHeads;
private Set<ObjectId> nonHeads;
private Set<ObjectId> txnHeads;
@@ -167,6 +173,34 @@ public class DfsGarbageCollector {
return this;
}

/**
* @return garbage packs older than this limit (in milliseconds) will be
* pruned as part of the garbage collection process if the value is
* > 0, otherwise garbage packs are retained.
*/
public long getGarbageTtlMillis() {
return garbageTtlMillis;
}

/**
* Set the time to live for garbage objects.
* <p>
* Any UNREACHABLE_GARBAGE older than this limit will be pruned at the end
* of the run.
* <p>
* If timeToLiveMillis is set to 0, UNREACHABLE_GARBAGE purging is disabled.
*
* @param ttl
* Time to live whatever unit is specified.
* @param unit
* The specified time unit.
* @return {@code this}
*/
public DfsGarbageCollector setGarbageTtl(long ttl, TimeUnit unit) {
garbageTtlMillis = unit.toMillis(ttl);
return this;
}

/**
* Create a single new pack file containing all of the live objects.
* <p>
@@ -189,6 +223,12 @@ public class DfsGarbageCollector {
if (packConfig.getIndexVersion() != 2)
throw new IllegalStateException(
JGitText.get().supportOnlyPackIndexVersion2);
if (garbageTtlMillis > 0) {
// We disable coalescing because the coalescing step will keep
// refreshing the UNREACHABLE_GARBAGE pack and we wouldn't
// actually prune anything.
coalesceGarbageLimit = 0;
}

startTimeMillis = System.currentTimeMillis();
ctx = (DfsReader) objdb.newReader();
@@ -197,9 +237,14 @@ public class DfsGarbageCollector {
objdb.clearCache();

Collection<Ref> refsBefore = getAllRefs();
packsBefore = packsToRebuild();
if (packsBefore.isEmpty())
readPacksBefore();

if (packsBefore.isEmpty()) {
if (!expiredGarbagePacks.isEmpty()) {
objdb.commitPack(noPacks(), toPrune());
}
return true;
}

allHeads = new HashSet<ObjectId>();
nonHeads = new HashSet<ObjectId>();
@@ -254,17 +299,60 @@ public class DfsGarbageCollector {
return refs;
}

private List<DfsPackFile> packsToRebuild() throws IOException {
private void readPacksBefore() throws IOException {
DfsPackFile[] packs = objdb.getPacks();
List<DfsPackFile> out = new ArrayList<DfsPackFile>(packs.length);
packsBefore = new ArrayList<DfsPackFile>(packs.length);
expiredGarbagePacks = new ArrayList<DfsPackFile>(packs.length);

long mostRecentGC = mostRecentGC(packs);
long now = System.currentTimeMillis();
for (DfsPackFile p : packs) {
DfsPackDescription d = p.getPackDescription();
if (d.getPackSource() != UNREACHABLE_GARBAGE) {
packsBefore.add(p);
} else if (packIsExpiredGarbage(d, mostRecentGC, now)) {
expiredGarbagePacks.add(p);
} else if (d.getFileSize(PackExt.PACK) < coalesceGarbageLimit) {
packsBefore.add(p);
}
}
}

private static long mostRecentGC(DfsPackFile[] packs) {
long r = 0;
for (DfsPackFile p : packs) {
DfsPackDescription d = p.getPackDescription();
if (d.getPackSource() != UNREACHABLE_GARBAGE)
out.add(p);
else if (d.getFileSize(PackExt.PACK) < coalesceGarbageLimit)
out.add(p);
if (d.getPackSource() == GC || d.getPackSource() == GC_REST) {
r = Math.max(r, d.getLastModified());
}
}
return out;
return r;
}

private boolean packIsExpiredGarbage(DfsPackDescription d,
long mostRecentGC, long now) {
// It should be safe to remove an UNREACHABLE_GARBAGE pack if it:
//
// (a) Predates the most recent prior run of this class. This check
// ensures the graph traversal algorithm had a chance to consider
// all objects in this pack and copied them into a GC or GC_REST
// pack if the graph contained live edges to the objects.
//
// This check is safe because of the ordering of packing; the GC
// packs are written first and then the UNREACHABLE_GARBAGE is
// constructed. Any UNREACHABLE_GARBAGE dated earlier than the GC
// was input to the prior GC's graph traversal.
//
// (b) Is older than garbagePackTtl. This check gives concurrent
// inserter threads sufficient time to identify an object is not
// in the graph and should have a new copy written, rather than
// relying on something from an UNREACHABLE_GARBAGE pack.
//
// Both (a) and (b) must be met to safely remove UNREACHABLE_GARBAGE.
return d.getPackSource() == UNREACHABLE_GARBAGE
&& d.getLastModified() < mostRecentGC
&& garbageTtlMillis > 0
&& now - d.getLastModified() >= garbageTtlMillis;
}

/** @return all of the source packs that fed into this compaction. */
@@ -285,8 +373,12 @@ public class DfsGarbageCollector {
private List<DfsPackDescription> toPrune() {
int cnt = packsBefore.size();
List<DfsPackDescription> all = new ArrayList<DfsPackDescription>(cnt);
for (DfsPackFile pack : packsBefore)
for (DfsPackFile pack : packsBefore) {
all.add(pack.getPackDescription());
}
for (DfsPackFile pack : expiredGarbagePacks) {
all.add(pack.getPackDescription());
}
return all;
}

@@ -329,7 +421,6 @@ public class DfsGarbageCollector {
}

private void packGarbage(ProgressMonitor pm) throws IOException {
// TODO(sop) This is ugly. The garbage pack needs to be deleted.
PackConfig cfg = new PackConfig(packConfig);
cfg.setReuseDeltas(true);
cfg.setReuseObjects(true);
@@ -420,4 +511,8 @@ public class DfsGarbageCollector {
DfsBlockCache.getInstance().getOrCreate(pack, null);
return pack;
}

private static List<DfsPackDescription> noPacks() {
return Collections.emptyList();
}
}

Loading…
Cancel
Save