From 934ada531b4cf78b76f5b458190c1f057ff770c9 Mon Sep 17 00:00:00 2001 From: James Moger Date: Mon, 12 Mar 2012 20:52:34 -0400 Subject: [PATCH] Revised reindex method per JGit team recommendation --- src/com/gitblit/LuceneExecutor.java | 9 +- src/com/gitblit/models/SearchResult.java | 2 + src/com/gitblit/utils/LuceneUtils.java | 234 +++++++++++-------- tests/com/gitblit/tests/LuceneUtilsTest.java | 26 +-- 4 files changed, 157 insertions(+), 114 deletions(-) diff --git a/src/com/gitblit/LuceneExecutor.java b/src/com/gitblit/LuceneExecutor.java index 4f06b4ec..c9e4c73e 100644 --- a/src/com/gitblit/LuceneExecutor.java +++ b/src/com/gitblit/LuceneExecutor.java @@ -121,6 +121,7 @@ public class LuceneExecutor implements Runnable { } index(name, repository); repository.close(); + System.gc(); processed.add(name); } catch (Throwable e) { logger.error(MessageFormat.format("Failed to update {0} Lucene index", @@ -145,18 +146,16 @@ public class LuceneExecutor implements Runnable { if (LuceneUtils.shouldReindex(repository)) { // (re)build the entire index long start = System.currentTimeMillis(); - String msg = "Building {0} Lucene index..."; - logger.info(MessageFormat.format(msg, name)); - IndexResult result = LuceneUtils.reindex(name, repository, true); + IndexResult result = LuceneUtils.reindex(name, repository); float duration = (System.currentTimeMillis() - start)/1000f; if (result.success) { if (result.commitCount > 0) { - msg = "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs"; + String msg = "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs"; logger.info(MessageFormat.format(msg, name, result.commitCount, result.blobCount, result.branchCount, duration)); } } else { - msg = "Could not build {0} Lucene index!"; + String msg = "Could not build {0} Lucene index!"; logger.error(MessageFormat.format(msg, name)); } } else { diff --git a/src/com/gitblit/models/SearchResult.java b/src/com/gitblit/models/SearchResult.java index ffe4d870..c74229a9 100644 --- a/src/com/gitblit/models/SearchResult.java +++ b/src/com/gitblit/models/SearchResult.java @@ -26,6 +26,8 @@ public class SearchResult implements Serializable { public String summary; + public String content; + public String repository; public String branch; diff --git a/src/com/gitblit/utils/LuceneUtils.java b/src/com/gitblit/utils/LuceneUtils.java index 3b3c30e1..3c2606bc 100644 --- a/src/com/gitblit/utils/LuceneUtils.java +++ b/src/com/gitblit/utils/LuceneUtils.java @@ -15,6 +15,8 @@ */ package com.gitblit.utils; +import static org.eclipse.jgit.treewalk.filter.TreeFilter.ANY_DIFF; + import java.io.ByteArrayOutputStream; import java.io.File; import java.io.IOException; @@ -23,11 +25,13 @@ import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; +import java.util.Comparator; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ConcurrentHashMap; @@ -56,18 +60,16 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.eclipse.jgit.diff.DiffEntry.ChangeType; import org.eclipse.jgit.lib.Constants; -import org.eclipse.jgit.lib.FileMode; import org.eclipse.jgit.lib.ObjectId; import org.eclipse.jgit.lib.ObjectLoader; +import org.eclipse.jgit.lib.ObjectReader; import org.eclipse.jgit.lib.Repository; import org.eclipse.jgit.revwalk.RevCommit; -import org.eclipse.jgit.revwalk.RevObject; +import org.eclipse.jgit.revwalk.RevTree; import org.eclipse.jgit.revwalk.RevWalk; import org.eclipse.jgit.storage.file.FileBasedConfig; +import org.eclipse.jgit.treewalk.EmptyTreeIterator; import org.eclipse.jgit.treewalk.TreeWalk; -import org.eclipse.jgit.treewalk.filter.AndTreeFilter; -import org.eclipse.jgit.treewalk.filter.PathFilterGroup; -import org.eclipse.jgit.treewalk.filter.TreeFilter; import org.eclipse.jgit.util.FS; import com.gitblit.models.IssueModel; @@ -241,15 +243,9 @@ public class LuceneUtils { * * @param repositoryName * @param repository - * @param fullIndex - * If false blob metadata is set to the HEAD revision of each - * branch. If true, each the last commit of each blob is - * determined to properly index the author, committer, and date. - * Full indexing can be time-consuming. * @return IndexResult */ - public static IndexResult reindex(String repositoryName, Repository repository, - boolean fullIndex) { + public static IndexResult reindex(String repositoryName, Repository repository) { IndexResult result = new IndexResult(); if (!LuceneUtils.deleteIndex(repository)) { return result; @@ -270,101 +266,139 @@ public class LuceneUtils { } tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName); } + + ObjectReader reader = repository.newObjectReader(); - // walk through each branch + // get the local branches List branches = JGitUtils.getLocalBranches(repository, true, -1); + + // sort them by most recently updated + Collections.sort(branches, new Comparator() { + @Override + public int compare(RefModel ref1, RefModel ref2) { + return ref2.getDate().compareTo(ref1.getDate()); + } + }); + + // reorder default branch to first position + RefModel defaultBranch = null; + ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository); + for (RefModel branch : branches) { + if (branch.getObjectId().equals(defaultBranchId)) { + defaultBranch = branch; + break; + } + } + branches.remove(defaultBranch); + branches.add(0, defaultBranch); + + // walk through each branch for (RefModel branch : branches) { if (excludedBranches.contains(branch.getName())) { continue; } + String branchName = branch.getName(); - RevWalk revWalk = new RevWalk(repository); - RevCommit branchHead = revWalk.parseCommit(branch.getObjectId()); - String head = branchHead.getId().getName(); + RevWalk revWalk = new RevWalk(reader); + RevCommit tip = revWalk.parseCommit(branch.getObjectId()); + String tipId = tip.getId().getName(); String keyName = getBranchKey(branchName); config.setString(CONF_ALIAS, null, keyName, branchName); - config.setString(CONF_BRANCH, null, keyName, head); + config.setString(CONF_BRANCH, null, keyName, tipId); // index the blob contents of the tree - ByteArrayOutputStream os = new ByteArrayOutputStream(); - byte[] tmp = new byte[32767]; TreeWalk treeWalk = new TreeWalk(repository); - treeWalk.addTree(branchHead.getTree()); - treeWalk.setRecursive(true); - + treeWalk.addTree(tip.getTree()); + treeWalk.setRecursive(true); + Map paths = new TreeMap(); while (treeWalk.next()) { - result.blobCount++; - String blobPath = treeWalk.getPathString(); - RevCommit blobRev = branchHead; - - RevWalk blobWalk = null; - if (fullIndex) { - // XXX this is _really_ slow, there must be a better way - // determine the most recent commit for this blob - blobWalk = new RevWalk(repository); - blobWalk.markStart(blobWalk.parseCommit(branch.getObjectId())); - TreeFilter filter = AndTreeFilter.create( - PathFilterGroup.createFromStrings(Collections.singleton(blobPath)), - TreeFilter.ANY_DIFF); - blobWalk.setTreeFilter(filter); - blobRev = blobWalk.next(); - } - - String blobAuthor = getAuthor(blobRev); - String blobCommitter = getCommitter(blobRev); - String blobDate = DateTools.timeToString(blobRev.getCommitTime() * 1000L, - Resolution.MINUTE); - - if (blobWalk != null) { - blobWalk.dispose(); - } - - Document doc = new Document(); - doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); - doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_OBJECT_ID, blobPath, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO)); - doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED)); + paths.put(treeWalk.getPathString(), treeWalk.getObjectId(0)); + } - // determine extension to compare to the extension - // blacklist - String ext = null; - String name = blobPath.toLowerCase(); - if (name.indexOf('.') > -1) { - ext = name.substring(name.lastIndexOf('.') + 1); - } + ByteArrayOutputStream os = new ByteArrayOutputStream(); + byte[] tmp = new byte[32767]; - if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { - // read the blob content - ObjectId entid = treeWalk.getObjectId(0); - FileMode entmode = treeWalk.getFileMode(0); - RevObject ro = revWalk.lookupAny(entid, entmode.getObjectType()); - revWalk.parseBody(ro); - ObjectLoader ldr = repository.open(ro.getId(), Constants.OBJ_BLOB); - InputStream in = ldr.openStream(); - os.reset(); - int n = 0; - while ((n = in.read(tmp)) > 0) { - os.write(tmp, 0, n); + RevWalk commitWalk = new RevWalk(reader); + commitWalk.markStart(tip); + + RevCommit commit; + while ((paths.size() > 0) && (commit = commitWalk.next()) != null) { + TreeWalk diffWalk = new TreeWalk(reader); + int parentCount = commit.getParentCount(); + switch (parentCount) { + case 0: + diffWalk.addTree(new EmptyTreeIterator()); + break; + case 1: + diffWalk.addTree(getTree(commitWalk, commit.getParent(0))); + break; + default: + // skip merge commits + continue; + } + diffWalk.addTree(getTree(commitWalk, commit)); + diffWalk.setFilter(ANY_DIFF); + diffWalk.setRecursive(true); + while ((paths.size() > 0) && diffWalk.next()) { + String path = diffWalk.getPathString(); + if (!paths.containsKey(path)) { + continue; } - in.close(); - byte[] content = os.toByteArray(); - String str = new String(content, "UTF-8"); - doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED)); + + // remove path from set + ObjectId blobId = paths.remove(path); + result.blobCount++; + + // index the blob metadata + String blobAuthor = getAuthor(commit); + String blobCommitter = getCommitter(commit); + String blobDate = DateTools.timeToString(commit.getCommitTime() * 1000L, + Resolution.MINUTE); + + Document doc = new Document(); + doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_OBJECT_ID, path, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO)); + doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED)); + doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED)); + + // determine extension to compare to the extension + // blacklist + String ext = null; + String name = path.toLowerCase(); + if (name.indexOf('.') > -1) { + ext = name.substring(name.lastIndexOf('.') + 1); + } + + // index the blob content + if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { + ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB); + InputStream in = ldr.openStream(); + int n; + while ((n = in.read(tmp)) > 0) { + os.write(tmp, 0, n); + } + in.close(); + byte[] content = os.toByteArray(); + String str = new String(content, Constants.CHARACTER_ENCODING); + doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED)); + os.reset(); + } + + // add the blob to the index writer.addDocument(doc); } } os.close(); - treeWalk.release(); - // index the head commit object - if (indexedCommits.add(head)) { - Document doc = createDocument(branchHead, tags.get(head)); + // index the tip commit object + if (indexedCommits.add(tipId)) { + Document doc = createDocument(tip, tags.get(tipId)); doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED)); doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED)); writer.addDocument(doc); @@ -373,10 +407,10 @@ public class LuceneUtils { } // traverse the log and index the previous commit objects - revWalk.reset(); - revWalk.markStart(branchHead); + RevWalk historyWalk = new RevWalk(reader); + historyWalk.markStart(historyWalk.parseCommit(tip.getId())); RevCommit rev; - while ((rev = revWalk.next()) != null) { + while ((rev = historyWalk.next()) != null) { String hash = rev.getId().getName(); if (indexedCommits.add(hash)) { Document doc = createDocument(rev, tags.get(hash)); @@ -386,11 +420,11 @@ public class LuceneUtils { result.commitCount += 1; } } - - // finished - revWalk.dispose(); } + // finished + reader.release(); + // this repository has a gb-issues branch, index all issues if (IssueUtils.getIssuesBranch(repository) != null) { List issues = IssueUtils.getIssues(repository, null); @@ -416,6 +450,23 @@ public class LuceneUtils { } return result; } + + /** + * Get the tree associated with the given commit. + * + * @param walk + * @param commit + * @return tree + * @throws IOException + */ + protected static RevTree getTree(final RevWalk walk, final RevCommit commit) + throws IOException { + final RevTree tree = commit.getTree(); + if (tree != null) + return tree; + walk.parseHeaders(commit); + return commit.getTree(); + } /** * Incrementally update the index with the specified commit for the @@ -639,7 +690,7 @@ public class LuceneUtils { doc.add(new Field(FIELD_ATTACHMENT, StringUtils.flattenStrings(attachments), Store.YES, Index.ANALYZED)); doc.add(new Field(FIELD_SUMMARY, issue.summary, Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.NO, Index.ANALYZED)); + doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.YES, Index.ANALYZED)); doc.add(new Field(FIELD_LABEL, StringUtils.flattenStrings(issue.getLabels()), Store.YES, Index.ANALYZED)); return doc; @@ -662,7 +713,7 @@ public class LuceneUtils { doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED)); doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED)); doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED)); - doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.NO, Index.ANALYZED)); + doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.YES, Index.ANALYZED)); if (!ArrayUtils.isEmpty(tags)) { doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED)); } @@ -696,6 +747,7 @@ public class LuceneUtils { result.score = score; result.date = DateTools.stringToDate(doc.get(FIELD_DATE)); result.summary = doc.get(FIELD_SUMMARY); + result.content = doc.get(FIELD_CONTENT); result.author = doc.get(FIELD_AUTHOR); result.committer = doc.get(FIELD_COMMITTER); result.type = ObjectType.fromName(doc.get(FIELD_OBJECT_TYPE)); diff --git a/tests/com/gitblit/tests/LuceneUtilsTest.java b/tests/com/gitblit/tests/LuceneUtilsTest.java index 3b21e712..e7754586 100644 --- a/tests/com/gitblit/tests/LuceneUtilsTest.java +++ b/tests/com/gitblit/tests/LuceneUtilsTest.java @@ -35,45 +35,35 @@ import com.gitblit.utils.StringUtils; public class LuceneUtilsTest { @Test - public void testQuickIndex() throws Exception { + public void testIndex() throws Exception { // reindex helloworld Repository repository = GitBlitSuite.getHelloworldRepository(); String name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(), repository.getDirectory().getAbsolutePath()); - LuceneUtils.reindex(name, repository, false); + LuceneUtils.reindex(name, repository); + SearchResult result = LuceneUtils.search("type:blob AND id:bit.bit", 1, repository).get(0); + assertEquals("Mike Donaghy", result.author); + result = LuceneUtils.search("type:blob AND id:clipper.prg", 1, repository).get(0); + assertEquals("tinogomes", result.author); repository.close(); // reindex theoretical physics repository = GitBlitSuite.getTheoreticalPhysicsRepository(); name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(), repository.getDirectory().getAbsolutePath()); - LuceneUtils.reindex(name, repository, false); + LuceneUtils.reindex(name, repository); repository.close(); // reindex JGit repository = GitBlitSuite.getJGitRepository(); name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(), repository.getDirectory().getAbsolutePath()); - LuceneUtils.reindex(name, repository, false); + LuceneUtils.reindex(name, repository); repository.close(); LuceneUtils.close(); } - @Test - public void testFullIndex() throws Exception { - // reindex helloworld - Repository repository = GitBlitSuite.getHelloworldRepository(); - String name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(), - repository.getDirectory().getAbsolutePath()); - LuceneUtils.reindex(name, repository, true); - SearchResult result = LuceneUtils.search("type:blob AND id:bit.bit", 1, repository).get(0); - repository.close(); - assertEquals("Mike Donaghy", result.author); - //assertEquals("Mike Donaghy", result.date); - LuceneUtils.close(); - } - @Test public void testQuery() throws Exception { // 2 occurrences on the master branch -- 2.39.5