]> source.dussan.org Git - gitblit.git/commitdiff
Revised reindex method per JGit team recommendation
authorJames Moger <james.moger@gitblit.com>
Tue, 13 Mar 2012 00:52:34 +0000 (20:52 -0400)
committerJames Moger <james.moger@gitblit.com>
Tue, 13 Mar 2012 00:52:34 +0000 (20:52 -0400)
src/com/gitblit/LuceneExecutor.java
src/com/gitblit/models/SearchResult.java
src/com/gitblit/utils/LuceneUtils.java
tests/com/gitblit/tests/LuceneUtilsTest.java

index 4f06b4ec13a3bd26ec1ac4780eaf6fe6e576dfa6..c9e4c73e35b626b7eb2e3a41b3e8bb81108f3622 100644 (file)
@@ -121,6 +121,7 @@ public class LuceneExecutor implements Runnable {
                                        }\r
                                        index(name, repository);\r
                                        repository.close();\r
+                                       System.gc();\r
                                        processed.add(name);\r
                                } catch (Throwable e) {\r
                                        logger.error(MessageFormat.format("Failed to update {0} Lucene index",\r
@@ -145,18 +146,16 @@ public class LuceneExecutor implements Runnable {
                                if (LuceneUtils.shouldReindex(repository)) {\r
                                        // (re)build the entire index\r
                                        long start = System.currentTimeMillis();\r
-                                       String msg = "Building {0} Lucene index...";\r
-                                       logger.info(MessageFormat.format(msg, name));\r
-                                       IndexResult result = LuceneUtils.reindex(name, repository, true);\r
+                                       IndexResult result = LuceneUtils.reindex(name, repository);\r
                                        float duration = (System.currentTimeMillis() - start)/1000f;\r
                                        if (result.success) {\r
                                                if (result.commitCount > 0) {\r
-                                                       msg = "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs";\r
+                                                       String msg = "Built {0} Lucene index from {1} commits and {2} files across {3} branches in {4} secs";\r
                                                        logger.info(MessageFormat.format(msg, name,\r
                                                                        result.commitCount, result.blobCount, result.branchCount, duration));\r
                                                }\r
                                        } else {\r
-                                               msg = "Could not build {0} Lucene index!";\r
+                                               String msg = "Could not build {0} Lucene index!";\r
                                                logger.error(MessageFormat.format(msg, name));\r
                                        }\r
                                } else {\r
index ffe4d870e15b8f143e8c21be9d96702e61912826..c74229a99488fb52dcb3b80e2f8a13a86efe33f1 100644 (file)
@@ -26,6 +26,8 @@ public class SearchResult implements Serializable {
 \r
        public String summary;\r
        \r
+       public String content;\r
+       \r
        public String repository;\r
        \r
        public String branch;\r
index 3b3c30e1244169bbdd685b2fa349f41ab4a0a3b7..3c2606bcc6c5e26aa984fb72c7183e04cd3a823a 100644 (file)
@@ -15,6 +15,8 @@
  */\r
 package com.gitblit.utils;\r
 \r
+import static org.eclipse.jgit.treewalk.filter.TreeFilter.ANY_DIFF;\r
+\r
 import java.io.ByteArrayOutputStream;\r
 import java.io.File;\r
 import java.io.IOException;\r
@@ -23,11 +25,13 @@ import java.text.ParseException;
 import java.util.ArrayList;\r
 import java.util.Arrays;\r
 import java.util.Collections;\r
+import java.util.Comparator;\r
 import java.util.HashMap;\r
 import java.util.LinkedHashSet;\r
 import java.util.List;\r
 import java.util.Map;\r
 import java.util.Set;\r
+import java.util.TreeMap;\r
 import java.util.TreeSet;\r
 import java.util.concurrent.ConcurrentHashMap;\r
 \r
@@ -56,18 +60,16 @@ import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.util.Version;\r
 import org.eclipse.jgit.diff.DiffEntry.ChangeType;\r
 import org.eclipse.jgit.lib.Constants;\r
-import org.eclipse.jgit.lib.FileMode;\r
 import org.eclipse.jgit.lib.ObjectId;\r
 import org.eclipse.jgit.lib.ObjectLoader;\r
+import org.eclipse.jgit.lib.ObjectReader;\r
 import org.eclipse.jgit.lib.Repository;\r
 import org.eclipse.jgit.revwalk.RevCommit;\r
-import org.eclipse.jgit.revwalk.RevObject;\r
+import org.eclipse.jgit.revwalk.RevTree;\r
 import org.eclipse.jgit.revwalk.RevWalk;\r
 import org.eclipse.jgit.storage.file.FileBasedConfig;\r
+import org.eclipse.jgit.treewalk.EmptyTreeIterator;\r
 import org.eclipse.jgit.treewalk.TreeWalk;\r
-import org.eclipse.jgit.treewalk.filter.AndTreeFilter;\r
-import org.eclipse.jgit.treewalk.filter.PathFilterGroup;\r
-import org.eclipse.jgit.treewalk.filter.TreeFilter;\r
 import org.eclipse.jgit.util.FS;\r
 \r
 import com.gitblit.models.IssueModel;\r
@@ -241,15 +243,9 @@ public class LuceneUtils {
         * \r
         * @param repositoryName\r
         * @param repository\r
-        * @param fullIndex\r
-        *            If false blob metadata is set to the HEAD revision of each\r
-        *            branch.  If true, each the last commit of each blob is\r
-        *            determined to properly index the author, committer, and date.\r
-        *            Full indexing can be time-consuming.\r
         * @return IndexResult\r
         */\r
-       public static IndexResult reindex(String repositoryName, Repository repository,\r
-                       boolean fullIndex) {\r
+       public static IndexResult reindex(String repositoryName, Repository repository) {\r
                IndexResult result = new IndexResult();\r
                if (!LuceneUtils.deleteIndex(repository)) {\r
                        return result;\r
@@ -270,101 +266,139 @@ public class LuceneUtils {
                                }\r
                                tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName);\r
                        }\r
+                       \r
+                       ObjectReader reader = repository.newObjectReader();\r
 \r
-                       // walk through each branch\r
+                       // get the local branches\r
                        List<RefModel> branches = JGitUtils.getLocalBranches(repository, true, -1);\r
+                       \r
+                       // sort them by most recently updated\r
+                       Collections.sort(branches, new Comparator<RefModel>() {\r
+                               @Override\r
+                               public int compare(RefModel ref1, RefModel ref2) {\r
+                                       return ref2.getDate().compareTo(ref1.getDate());\r
+                               }\r
+                       });\r
+                       \r
+                       // reorder default branch to first position\r
+                       RefModel defaultBranch = null;\r
+                       ObjectId defaultBranchId = JGitUtils.getDefaultBranch(repository);\r
+                       for (RefModel branch :  branches) {\r
+                               if (branch.getObjectId().equals(defaultBranchId)) {\r
+                                       defaultBranch = branch;                                 \r
+                                       break;\r
+                               }\r
+                       }\r
+                       branches.remove(defaultBranch);\r
+                       branches.add(0, defaultBranch);\r
+                       \r
+                       // walk through each branch\r
                        for (RefModel branch : branches) {\r
                                if (excludedBranches.contains(branch.getName())) {\r
                                        continue;\r
                                }\r
+\r
                                String branchName = branch.getName();\r
-                               RevWalk revWalk = new RevWalk(repository);\r
-                               RevCommit branchHead = revWalk.parseCommit(branch.getObjectId());\r
-                               String head = branchHead.getId().getName();\r
+                               RevWalk revWalk = new RevWalk(reader);\r
+                               RevCommit tip = revWalk.parseCommit(branch.getObjectId());\r
+                               String tipId = tip.getId().getName();\r
 \r
                                String keyName = getBranchKey(branchName);\r
                                config.setString(CONF_ALIAS, null, keyName, branchName);\r
-                               config.setString(CONF_BRANCH, null, keyName, head);\r
+                               config.setString(CONF_BRANCH, null, keyName, tipId);\r
 \r
                                // index the blob contents of the tree\r
-                               ByteArrayOutputStream os = new ByteArrayOutputStream();\r
-                               byte[] tmp = new byte[32767];\r
                                TreeWalk treeWalk = new TreeWalk(repository);\r
-                               treeWalk.addTree(branchHead.getTree());\r
-                               treeWalk.setRecursive(true);\r
-                                                               \r
+                               treeWalk.addTree(tip.getTree());\r
+                               treeWalk.setRecursive(true);                                                            \r
                                \r
+                               Map<String, ObjectId> paths = new TreeMap<String, ObjectId>();\r
                                while (treeWalk.next()) {\r
-                                       result.blobCount++;\r
-                                       String blobPath = treeWalk.getPathString();\r
-                                       RevCommit blobRev = branchHead;\r
-                               \r
-                                       RevWalk blobWalk = null;\r
-                                       if (fullIndex) {\r
-                                               // XXX this is _really_ slow, there must be a better way\r
-                                               // determine the most recent commit for this blob\r
-                                               blobWalk = new RevWalk(repository);\r
-                                               blobWalk.markStart(blobWalk.parseCommit(branch.getObjectId()));\r
-                                               TreeFilter filter = AndTreeFilter.create(\r
-                                                               PathFilterGroup.createFromStrings(Collections.singleton(blobPath)),\r
-                                                               TreeFilter.ANY_DIFF);\r
-                                               blobWalk.setTreeFilter(filter);\r
-                                               blobRev = blobWalk.next();\r
-                                       }\r
-                                       \r
-                                       String blobAuthor = getAuthor(blobRev);\r
-                                       String blobCommitter = getCommitter(blobRev);\r
-                                       String blobDate = DateTools.timeToString(blobRev.getCommitTime() * 1000L,\r
-                                                       Resolution.MINUTE);\r
-                                       \r
-                                       if (blobWalk != null) {\r
-                                               blobWalk.dispose();                                             \r
-                                       }\r
-                                       \r
-                                       Document doc = new Document();\r
-                                       doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));\r
-                                       doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));\r
-                                       doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));\r
-                                       doc.add(new Field(FIELD_OBJECT_ID, blobPath, Store.YES, Index.ANALYZED));\r
-                                       doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO));\r
-                                       doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED));\r
-                                       doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED));                                  \r
+                                       paths.put(treeWalk.getPathString(), treeWalk.getObjectId(0));\r
+                               }                               \r
 \r
-                                       // determine extension to compare to the extension\r
-                                       // blacklist\r
-                                       String ext = null;\r
-                                       String name = blobPath.toLowerCase();\r
-                                       if (name.indexOf('.') > -1) {\r
-                                               ext = name.substring(name.lastIndexOf('.') + 1);\r
-                                       }\r
+                               ByteArrayOutputStream os = new ByteArrayOutputStream();\r
+                               byte[] tmp = new byte[32767];\r
 \r
-                                       if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {\r
-                                               // read the blob content\r
-                                               ObjectId entid = treeWalk.getObjectId(0);\r
-                                               FileMode entmode = treeWalk.getFileMode(0);\r
-                                               RevObject ro = revWalk.lookupAny(entid, entmode.getObjectType());\r
-                                               revWalk.parseBody(ro);\r
-                                               ObjectLoader ldr = repository.open(ro.getId(), Constants.OBJ_BLOB);\r
-                                               InputStream in = ldr.openStream();\r
-                                               os.reset();\r
-                                               int n = 0;\r
-                                               while ((n = in.read(tmp)) > 0) {\r
-                                                       os.write(tmp, 0, n);\r
+                               RevWalk commitWalk = new RevWalk(reader);\r
+                               commitWalk.markStart(tip);\r
+                               \r
+                               RevCommit commit;\r
+                               while ((paths.size() > 0) && (commit = commitWalk.next()) != null) {\r
+                                       TreeWalk diffWalk = new TreeWalk(reader);\r
+                                       int parentCount = commit.getParentCount();\r
+                                       switch (parentCount) {\r
+                                       case 0:\r
+                                               diffWalk.addTree(new EmptyTreeIterator());\r
+                                               break;\r
+                                       case 1:\r
+                                               diffWalk.addTree(getTree(commitWalk, commit.getParent(0)));\r
+                                               break;\r
+                                       default:\r
+                                               // skip merge commits\r
+                                               continue;\r
+                                       }\r
+                                       diffWalk.addTree(getTree(commitWalk, commit));\r
+                                       diffWalk.setFilter(ANY_DIFF);\r
+                                       diffWalk.setRecursive(true);\r
+                                       while ((paths.size() > 0) && diffWalk.next()) {\r
+                                               String path = diffWalk.getPathString();\r
+                                               if (!paths.containsKey(path)) {\r
+                                                       continue;\r
                                                }\r
-                                               in.close();\r
-                                               byte[] content = os.toByteArray();\r
-                                               String str = new String(content, "UTF-8");\r
-                                               doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED));\r
+                                               \r
+                                               // remove path from set\r
+                                               ObjectId blobId = paths.remove(path);\r
+                                               result.blobCount++;\r
+                                               \r
+                                               // index the blob metadata\r
+                                               String blobAuthor = getAuthor(commit);\r
+                                               String blobCommitter = getCommitter(commit);\r
+                                               String blobDate = DateTools.timeToString(commit.getCommitTime() * 1000L,\r
+                                                               Resolution.MINUTE);\r
+                                               \r
+                                               Document doc = new Document();\r
+                                               doc.add(new Field(FIELD_OBJECT_TYPE, ObjectType.blob.name(), Store.YES, Index.NOT_ANALYZED_NO_NORMS));\r
+                                               doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));\r
+                                               doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));\r
+                                               doc.add(new Field(FIELD_OBJECT_ID, path, Store.YES, Index.ANALYZED));\r
+                                               doc.add(new Field(FIELD_DATE, blobDate, Store.YES, Index.NO));\r
+                                               doc.add(new Field(FIELD_AUTHOR, blobAuthor, Store.YES, Index.ANALYZED));\r
+                                               doc.add(new Field(FIELD_COMMITTER, blobCommitter, Store.YES, Index.ANALYZED));                                  \r
+\r
+                                               // determine extension to compare to the extension\r
+                                               // blacklist\r
+                                               String ext = null;\r
+                                               String name = path.toLowerCase();\r
+                                               if (name.indexOf('.') > -1) {\r
+                                                       ext = name.substring(name.lastIndexOf('.') + 1);\r
+                                               }\r
+\r
+                                               // index the blob content\r
+                                               if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {                                                    \r
+                                                       ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB);\r
+                                                       InputStream in = ldr.openStream();                                                      \r
+                                                       int n;\r
+                                                       while ((n = in.read(tmp)) > 0) {\r
+                                                               os.write(tmp, 0, n);\r
+                                                       }\r
+                                                       in.close();\r
+                                                       byte[] content = os.toByteArray();\r
+                                                       String str = new String(content, Constants.CHARACTER_ENCODING);\r
+                                                       doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED));\r
+                                                       os.reset();\r
+                                               }                                                       \r
+                                               \r
+                                               // add the blob to the index\r
                                                writer.addDocument(doc);\r
                                        }\r
                                }\r
 \r
                                os.close();\r
-                               treeWalk.release();\r
 \r
-                               // index the head commit object\r
-                               if (indexedCommits.add(head)) {\r
-                                       Document doc = createDocument(branchHead, tags.get(head));\r
+                               // index the tip commit object\r
+                               if (indexedCommits.add(tipId)) {\r
+                                       Document doc = createDocument(tip, tags.get(tipId));\r
                                        doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.ANALYZED));\r
                                        doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.ANALYZED));\r
                                        writer.addDocument(doc);\r
@@ -373,10 +407,10 @@ public class LuceneUtils {
                                }\r
 \r
                                // traverse the log and index the previous commit objects\r
-                               revWalk.reset();\r
-                               revWalk.markStart(branchHead);\r
+                               RevWalk historyWalk = new RevWalk(reader);\r
+                               historyWalk.markStart(historyWalk.parseCommit(tip.getId()));\r
                                RevCommit rev;\r
-                               while ((rev = revWalk.next()) != null) {\r
+                               while ((rev = historyWalk.next()) != null) {\r
                                        String hash = rev.getId().getName();\r
                                        if (indexedCommits.add(hash)) {\r
                                                Document doc = createDocument(rev, tags.get(hash));\r
@@ -386,11 +420,11 @@ public class LuceneUtils {
                                                result.commitCount += 1;\r
                                        }\r
                                }\r
-\r
-                               // finished\r
-                               revWalk.dispose();\r
                        }\r
 \r
+                       // finished\r
+                       reader.release();\r
+                       \r
                        // this repository has a gb-issues branch, index all issues\r
                        if (IssueUtils.getIssuesBranch(repository) != null) {\r
                                List<IssueModel> issues = IssueUtils.getIssues(repository, null);\r
@@ -416,6 +450,23 @@ public class LuceneUtils {
                }\r
                return result;\r
        }\r
+       \r
+       /**\r
+        * Returns the tree of the given commit, parsing the commit headers first when the tree is not yet available.\r
+        *\r
+        * @param walk the RevWalk asked to parse the commit headers when commit.getTree() returns null\r
+        * @param commit the commit whose tree is wanted\r
+        * @return commit.getTree(); if that was initially null it is read again after walk.parseHeaders(commit)\r
+        * @throws IOException propagated from the calls made here (presumably walk.parseHeaders - confirm against JGit)\r
+        */\r
+       protected static RevTree getTree(final RevWalk walk, final RevCommit commit)\r
+                       throws IOException {\r
+               final RevTree tree = commit.getTree();\r
+               if (tree != null)\r
+                       return tree;\r
+               walk.parseHeaders(commit);\r
+               return commit.getTree();\r
+       }\r
 \r
        /**\r
         * Incrementally update the index with the specified commit for the\r
@@ -639,7 +690,7 @@ public class LuceneUtils {
                doc.add(new Field(FIELD_ATTACHMENT, StringUtils.flattenStrings(attachments), Store.YES,\r
                                Index.ANALYZED));\r
                doc.add(new Field(FIELD_SUMMARY, issue.summary, Store.YES, Index.ANALYZED));\r
-               doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.NO, Index.ANALYZED));\r
+               doc.add(new Field(FIELD_CONTENT, issue.toString(), Store.YES, Index.ANALYZED));\r
                doc.add(new Field(FIELD_LABEL, StringUtils.flattenStrings(issue.getLabels()), Store.YES,\r
                                Index.ANALYZED));\r
                return doc;\r
@@ -662,7 +713,7 @@ public class LuceneUtils {
                doc.add(new Field(FIELD_AUTHOR, getAuthor(commit), Store.YES, Index.ANALYZED));\r
                doc.add(new Field(FIELD_COMMITTER, getCommitter(commit), Store.YES, Index.ANALYZED));\r
                doc.add(new Field(FIELD_SUMMARY, commit.getShortMessage(), Store.YES, Index.ANALYZED));\r
-               doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.NO, Index.ANALYZED));\r
+               doc.add(new Field(FIELD_CONTENT, commit.getFullMessage(), Store.YES, Index.ANALYZED));\r
                if (!ArrayUtils.isEmpty(tags)) {\r
                        doc.add(new Field(FIELD_TAG, StringUtils.flattenStrings(tags), Store.YES, Index.ANALYZED));\r
                }\r
@@ -696,6 +747,7 @@ public class LuceneUtils {
                result.score = score;\r
                result.date = DateTools.stringToDate(doc.get(FIELD_DATE));\r
                result.summary = doc.get(FIELD_SUMMARY);\r
+               result.content = doc.get(FIELD_CONTENT);\r
                result.author = doc.get(FIELD_AUTHOR);\r
                result.committer = doc.get(FIELD_COMMITTER);\r
                result.type = ObjectType.fromName(doc.get(FIELD_OBJECT_TYPE));\r
index 3b21e712439fe168c813ed9318a7ce87e4e8a550..e77545865f846c7f9557b308dae6c14de15750db 100644 (file)
@@ -35,45 +35,35 @@ import com.gitblit.utils.StringUtils;
 public class LuceneUtilsTest {\r
 \r
        @Test\r
-       public void testQuickIndex() throws Exception {\r
+       public void testIndex() throws Exception {\r
                // reindex helloworld\r
                Repository repository = GitBlitSuite.getHelloworldRepository();\r
                String name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(),\r
                                repository.getDirectory().getAbsolutePath());\r
-               LuceneUtils.reindex(name, repository, false);\r
+               LuceneUtils.reindex(name, repository);\r
+               SearchResult result = LuceneUtils.search("type:blob AND id:bit.bit", 1, repository).get(0);             \r
+               assertEquals("Mike Donaghy", result.author);\r
+               result = LuceneUtils.search("type:blob AND id:clipper.prg", 1, repository).get(0);              \r
+               assertEquals("tinogomes", result.author);\r
                repository.close();\r
 \r
                // reindex theoretical physics\r
                repository = GitBlitSuite.getTheoreticalPhysicsRepository();\r
                name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(),\r
                                repository.getDirectory().getAbsolutePath());\r
-               LuceneUtils.reindex(name, repository, false);\r
+               LuceneUtils.reindex(name, repository);\r
                repository.close();\r
                \r
                // reindex JGit\r
                repository = GitBlitSuite.getJGitRepository();\r
                name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(),\r
                                repository.getDirectory().getAbsolutePath());\r
-               LuceneUtils.reindex(name, repository, false);\r
+               LuceneUtils.reindex(name, repository);\r
                repository.close();\r
                \r
                LuceneUtils.close();\r
        }\r
 \r
-       @Test\r
-       public void testFullIndex() throws Exception {\r
-               // reindex helloworld\r
-               Repository repository = GitBlitSuite.getHelloworldRepository();\r
-               String name = StringUtils.getRelativePath(GitBlitSuite.REPOSITORIES.getAbsolutePath(),\r
-                               repository.getDirectory().getAbsolutePath());\r
-               LuceneUtils.reindex(name, repository, true);\r
-               SearchResult result = LuceneUtils.search("type:blob AND id:bit.bit", 1, repository).get(0);             \r
-               repository.close();\r
-               assertEquals("Mike Donaghy", result.author);\r
-               //assertEquals("Mike Donaghy", result.date);\r
-               LuceneUtils.close();\r
-       }\r
-\r
        @Test\r
        public void testQuery() throws Exception {\r
                // 2 occurrences on the master branch\r