From e31da050c6ab5ece38fb18196948337395ae59e6 Mon Sep 17 00:00:00 2001 From: James Moger Date: Sun, 26 Feb 2012 17:40:48 -0500 Subject: [PATCH] Partially working Lucene executor. Needs refactoring. (issue 16) --- src/com/gitblit/GitBlit.java | 10 + src/com/gitblit/LuceneExecutor.java | 177 ++++++++++++++++ src/com/gitblit/utils/LuceneUtils.java | 209 +++++++++++++++---- tests/com/gitblit/tests/IssuesTest.java | 4 +- tests/com/gitblit/tests/LuceneUtilsTest.java | 6 +- 5 files changed, 361 insertions(+), 45 deletions(-) create mode 100644 src/com/gitblit/LuceneExecutor.java diff --git a/src/com/gitblit/GitBlit.java b/src/com/gitblit/GitBlit.java index e6f07e08..580bf628 100644 --- a/src/com/gitblit/GitBlit.java +++ b/src/com/gitblit/GitBlit.java @@ -136,6 +136,8 @@ public class GitBlit implements ServletContextListener { private MailExecutor mailExecutor; + private LuceneExecutor luceneExecutor; + private TimeZone timezone; public GitBlit() { @@ -1806,10 +1808,18 @@ public class GitBlit implements ServletContextListener { setUserService(loginService); mailExecutor = new MailExecutor(settings); if (mailExecutor.isReady()) { + logger.info("Mail executor is scheduled to process the message queue every 2 minutes."); scheduledExecutor.scheduleAtFixedRate(mailExecutor, 1, 2, TimeUnit.MINUTES); } else { logger.warn("Mail server is not properly configured. 
Mail services disabled."); } + luceneExecutor = new LuceneExecutor(settings); + if (luceneExecutor.isReady()) { + logger.info("Lucene executor is scheduled to process the repository queue every 10 minutes."); + scheduledExecutor.scheduleAtFixedRate(luceneExecutor, 1, 10, TimeUnit.MINUTES); + } else { + logger.warn("Lucene executor is disabled."); + } if (startFederation) { configureFederation(); } diff --git a/src/com/gitblit/LuceneExecutor.java b/src/com/gitblit/LuceneExecutor.java new file mode 100644 index 00000000..32004a73 --- /dev/null +++ b/src/com/gitblit/LuceneExecutor.java @@ -0,0 +1,177 @@ +/* + * Copyright 2012 gitblit.com. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package com.gitblit; + +import java.text.MessageFormat; +import java.util.HashSet; +import java.util.Queue; +import java.util.Set; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.concurrent.atomic.AtomicBoolean; + +import org.eclipse.jgit.lib.Repository; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.gitblit.models.RepositoryModel; +import com.gitblit.utils.JGitUtils; +import com.gitblit.utils.LuceneUtils; + +/** + * The Lucene executor handles indexing repositories synchronously and + * asynchronously from a queue. 
+ * + * @author James Moger + * + */ +public class LuceneExecutor implements Runnable { + + private final Logger logger = LoggerFactory.getLogger(LuceneExecutor.class); + + private final Queue queue = new ConcurrentLinkedQueue(); + + private final IStoredSettings settings; + + private final boolean isLuceneEnabled; + + private final boolean isPollingMode; + + private final AtomicBoolean firstRun = new AtomicBoolean(true); + + public LuceneExecutor(IStoredSettings settings) { + this.settings = settings; + this.isLuceneEnabled = settings.getBoolean("lucene.enableLucene", false); + this.isPollingMode = settings.getBoolean("lucene.pollingMode", false); + } + + /** + * Indicates if the Lucene executor can index repositories. + * + * @return true if the Lucene executor is ready to index repositories + */ + public boolean isReady() { + return isLuceneEnabled; + } + + /** + * Returns the status of the Lucene queue. + * + * @return true, if the queue is empty + */ + public boolean hasEmptyQueue() { + return queue.isEmpty(); + } + + /** + * Queues a repository to be asynchronously indexed. 
+ * + * @param repository + * @return true if the repository was queued + */ + public boolean queue(RepositoryModel repository) { + if (!isReady()) { + return false; + } + queue.add(repository.name); + return true; + } + + @Override + public void run() { + if (!isLuceneEnabled) { + return; + } + + if (firstRun.get() || isPollingMode) { + // update all indexes on first run or if polling mode + firstRun.set(false); + queue.addAll(GitBlit.self().getRepositoryList()); + } + + Set processed = new HashSet(); + if (!queue.isEmpty()) { + // update the repository Lucene index + String repositoryName = null; + while ((repositoryName = queue.poll()) != null) { + if (processed.contains(repositoryName)) { + // skipping multi-queued repository + continue; + } + try { + Repository repository = GitBlit.self().getRepository(repositoryName); + if (repository == null) { + logger.warn(MessageFormat.format( + "Lucene executor could not find repository {0}. Skipping.", + repositoryName)); + continue; + } + index(repositoryName, repository); + repository.close(); + processed.add(repositoryName); + } catch (Throwable e) { + logger.error(MessageFormat.format("Failed to update {0} Lucene index", + repositoryName), e); + } + } + } + } + + /** + * Synchronously indexes a repository. This may build a complete index of a + * repository or it may update an existing index. 
+ * + * @param repositoryName + * the name of the repository + * @param repository + * the repository object + */ + public void index(String repositoryName, Repository repository) { + try { + if (JGitUtils.hasCommits(repository)) { + if (LuceneUtils.shouldReindex(repository)) { + // (re)build the entire index + long start = System.currentTimeMillis(); + boolean success = LuceneUtils.reindex(repository); + long duration = System.currentTimeMillis() - start; + if (success) { + String msg = "Built {0} Lucene index in {1} msecs"; + logger.info(MessageFormat.format(msg, repositoryName, duration)); + } else { + String msg = "Could not build {0} Lucene index!"; + logger.error(MessageFormat.format(msg, repositoryName)); + } + } else { + // update the index with latest commits + long start = System.currentTimeMillis(); + boolean success = LuceneUtils.updateIndex(repository); + long duration = System.currentTimeMillis() - start; + if (success) { + String msg = "Updated {0} Lucene index in {1} msecs"; + logger.info(MessageFormat.format(msg, repositoryName, duration)); + } else { + String msg = "Could not update {0} Lucene index!"; + logger.error(MessageFormat.format(msg, repositoryName)); + } + } + } else { + logger.info(MessageFormat.format("Skipped Lucene index of empty repository {0}", + repositoryName)); + } + } catch (Throwable t) { + logger.error(MessageFormat.format("Lucene indexing failure for {0}", repositoryName), t); + } + } +} diff --git a/src/com/gitblit/utils/LuceneUtils.java b/src/com/gitblit/utils/LuceneUtils.java index 738382a4..eaf02dfb 100644 --- a/src/com/gitblit/utils/LuceneUtils.java +++ b/src/com/gitblit/utils/LuceneUtils.java @@ -7,6 +7,7 @@ import java.io.InputStream; import java.text.ParseException; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.HashMap; import java.util.LinkedHashSet; import java.util.List; @@ -47,7 +48,9 @@ import org.eclipse.jgit.lib.Repository; import 
org.eclipse.jgit.revwalk.RevCommit; import org.eclipse.jgit.revwalk.RevObject; import org.eclipse.jgit.revwalk.RevWalk; +import org.eclipse.jgit.storage.file.FileBasedConfig; import org.eclipse.jgit.treewalk.TreeWalk; +import org.eclipse.jgit.util.FS; import com.gitblit.models.IssueModel; import com.gitblit.models.IssueModel.Attachment; @@ -81,6 +84,7 @@ public class LuceneUtils { } private static final Version LUCENE_VERSION = Version.LUCENE_35; + private static final int INDEX_VERSION = 1; private static final String FIELD_OBJECT_TYPE = "type"; private static final String FIELD_OBJECT_ID = "id"; @@ -94,11 +98,9 @@ public class LuceneUtils { private static final String FIELD_LABEL = "label"; private static final String FIELD_ATTACHMENT = "attachment"; - private static Set excludedExtensions = new TreeSet( - Arrays.asList("7z", "arc", "arj", "bin", "bmp", "dll", "doc", - "docx", "exe", "gif", "gz", "jar", "jpg", "lib", "lzh", - "odg", "pdf", "ppt", "png", "so", "swf", "xcf", "xls", - "xlsx", "zip")); + private static Set excludedExtensions = new TreeSet(Arrays.asList("7z", "arc", + "arj", "bin", "bmp", "dll", "doc", "docx", "exe", "gif", "gz", "jar", "jpg", "lib", + "lzh", "odg", "pdf", "ppt", "png", "so", "swf", "xcf", "xls", "xlsx", "zip")); private static Set excludedBranches = new TreeSet( Arrays.asList("/refs/heads/gb-issues")); @@ -106,6 +108,12 @@ public class LuceneUtils { private static final Map SEARCHERS = new ConcurrentHashMap(); private static final Map WRITERS = new ConcurrentHashMap(); + private static final String CONF_FILE = "lucene.conf"; + private static final String CONF_INDEX = "index"; + private static final String CONF_VERSION = "version"; + private static final String CONF_ALIAS = "aliases"; + private static final String CONF_BRANCH = "branches"; + /** * Returns the name of the repository. 
* @@ -119,7 +127,49 @@ public class LuceneUtils { return repository.getDirectory().getParentFile().getName(); } } - + + /** + * Construct a keyname from the branch. + * + * @param branchName + * @return a keyname appropriate for the Git config file format + */ + private static String getBranchKey(String branchName) { + return StringUtils.getSHA1(branchName); + } + + /** + * Returns the Lucene configuration for the specified repository. + * + * @param repository + * @return a config object + */ + private static FileBasedConfig getConfig(Repository repository) { + File file = new File(repository.getDirectory(), CONF_FILE); + FileBasedConfig config = new FileBasedConfig(file, FS.detect()); + return config; + } + + /** + * Reads the Lucene config file for the repository to check the index + * version. If the index version is different, then rebuild the repository + * index. + * + * @param repository + * @return true if the on-disk index format is different than INDEX_VERSION + */ + public static boolean shouldReindex(Repository repository) { + try { + FileBasedConfig config = getConfig(repository); + config.load(); + int indexVersion = config.getInt(CONF_INDEX, CONF_VERSION, 0); + // reindex if versions do not match + return indexVersion != INDEX_VERSION; + } catch (Throwable t) { + } + return true; + } + /** * Deletes the Lucene index for the specified repository. 
* @@ -133,6 +183,10 @@ public class LuceneUtils { org.eclipse.jgit.util.FileUtils.delete(luceneIndex, org.eclipse.jgit.util.FileUtils.RECURSIVE); } + File luceneConfig = new File(repository.getDirectory(), CONF_FILE); + if (luceneConfig.exists()) { + luceneConfig.delete(); + } return true; } catch (IOException e) { throw new RuntimeException(e); @@ -146,14 +200,22 @@ public class LuceneUtils { * @param repository * @return true if the indexing has succeeded */ - public static boolean index(Repository repository) { + public static boolean reindex(Repository repository) { + if (!LuceneUtils.deleteIndex(repository)) { + return false; + } try { String repositoryName = getName(repository); + FileBasedConfig config = getConfig(repository); Set indexedCommits = new TreeSet(); IndexWriter writer = getIndexWriter(repository, true); // build a quick lookup of tags Map> tags = new HashMap>(); for (RefModel tag : JGitUtils.getTags(repository, false, -1)) { + if (!tag.isAnnotatedTag()) { + // skip non-annotated tags + continue; + } if (!tags.containsKey(tag.getObjectId())) { tags.put(tag.getReferencedObjectId().getName(), new ArrayList()); } @@ -170,6 +232,10 @@ public class LuceneUtils { RevWalk revWalk = new RevWalk(repository); RevCommit rev = revWalk.parseCommit(branch.getObjectId()); + String keyName = getBranchKey(branchName); + config.setString(CONF_ALIAS, null, keyName, branchName); + config.setString(CONF_BRANCH, null, keyName, rev.getName()); + // index the blob contents of the tree ByteArrayOutputStream os = new ByteArrayOutputStream(); byte[] tmp = new byte[32767]; @@ -184,8 +250,7 @@ public class LuceneUtils { Index.NOT_ANALYZED_NO_NORMS)); doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, - Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field(FIELD_OBJECT_ID, treeWalk.getPathString(), Store.YES, 
Index.NOT_ANALYZED)); doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO)); @@ -233,8 +298,7 @@ public class LuceneUtils { Document doc = createDocument(rev, tags.get(head)); doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, - Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.NOT_ANALYZED)); writer.addDocument(doc); } @@ -246,8 +310,7 @@ public class LuceneUtils { Document doc = createDocument(rev, tags.get(hash)); doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED)); - doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, - Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_BRANCH, branchName, Store.YES, Index.NOT_ANALYZED)); writer.addDocument(doc); } } @@ -268,6 +331,8 @@ public class LuceneUtils { } // commit all changes and reset the searcher + config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION); + config.save(); resetIndexSearcher(repository); writer.commit(); return true; @@ -288,13 +353,22 @@ public class LuceneUtils { * @return true, if successful */ public static boolean index(Repository repository, String branch, RevCommit commit) { - try { + try { if (excludedBranches.contains(branch)) { if (IssueUtils.GB_ISSUES.equals(branch)) { // index an issue String issueId = commit.getShortMessage().substring(2).trim(); IssueModel issue = IssueUtils.getIssue(repository, issueId); - return index(repository, issue, true); + if (issue == null) { + // delete the old issue from the index, if exists + IndexWriter writer = getIndexWriter(repository, false); + writer.deleteDocuments( + new Term(FIELD_OBJECT_TYPE, ObjectType.issue.name()), new Term( + FIELD_OBJECT_ID, issueId)); + writer.commit(); + return true; + } + return index(repository, issue); } return false; } @@ -306,9 +380,8 @@ public class LuceneUtils { for (PathChangeModel path : changedPaths) { // delete the indexed blob 
writer.deleteDocuments(new Term(FIELD_OBJECT_TYPE, ObjectType.blob.name()), - new Term(FIELD_BRANCH, branch), - new Term(FIELD_OBJECT_ID, path.path)); - + new Term(FIELD_BRANCH, branch), new Term(FIELD_OBJECT_ID, path.path)); + // re-index the blob if (!ChangeType.DELETE.equals(path.changeType)) { Document doc = new Document(); @@ -317,8 +390,7 @@ public class LuceneUtils { doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field(FIELD_BRANCH, branch, Store.YES, Index.NOT_ANALYZED)); - doc.add(new Field(FIELD_OBJECT_ID, path.path, Store.YES, - Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_OBJECT_ID, path.path, Store.YES, Index.NOT_ANALYZED)); doc.add(new Field(FIELD_DATE, revDate, Store.YES, Index.NO)); doc.add(new Field(FIELD_AUTHOR, commit.getAuthorIdent().getName(), Store.YES, Index.NOT_ANALYZED_NO_NORMS)); @@ -336,15 +408,15 @@ public class LuceneUtils { if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) { // read the blob content - String str = JGitUtils.getStringContent(repository, - commit.getTree(), path.path); + String str = JGitUtils.getStringContent(repository, commit.getTree(), + path.path); doc.add(new Field(FIELD_CONTENT, str, Store.NO, Index.ANALYZED)); writer.addDocument(doc); } } } writer.commit(); - + Document doc = createDocument(commit, null); return index(repository, doc); } catch (Exception e) { @@ -359,21 +431,17 @@ public class LuceneUtils { * * @param repository * @param issue - * @param reindex - * if true, the old index entry for this issue will be deleted. - * This is only appropriate for pre-existing/indexed issues. 
* @return true, if successful */ - public static boolean index(Repository repository, IssueModel issue, boolean reindex) { + public static boolean index(Repository repository, IssueModel issue) { try { + // delete the old issue from the index, if exists + IndexWriter writer = getIndexWriter(repository, false); + writer.deleteDocuments(new Term(FIELD_OBJECT_TYPE, ObjectType.issue.name()), new Term( + FIELD_OBJECT_ID, String.valueOf(issue.id))); + writer.commit(); + Document doc = createDocument(issue); - if (reindex) { - // delete the old issue from the index, if exists - IndexWriter writer = getIndexWriter(repository, false); - writer.deleteDocuments(new Term(FIELD_OBJECT_TYPE, ObjectType.issue.name()), - new Term(FIELD_OBJECT_ID, String.valueOf(issue.id))); - writer.commit(); - } return index(repository, doc); } catch (Exception e) { e.printStackTrace(); @@ -381,6 +449,68 @@ public class LuceneUtils { return false; } + /** + * Updates a repository index incrementally from the last indexed commits. 
+ * + * @param repository + */ + public static boolean updateIndex(Repository repository) { + boolean success = false; + try { + FileBasedConfig config = getConfig(repository); + config.load(); + + // build a quick lookup of annotated tags + Map> tags = new HashMap>(); + for (RefModel tag : JGitUtils.getTags(repository, false, -1)) { + if (!tag.isAnnotatedTag()) { + // skip non-annotated tags + continue; + } + if (!tags.containsKey(tag.getObjectId())) { + tags.put(tag.getReferencedObjectId().getName(), new ArrayList()); + } + tags.get(tag.getReferencedObjectId().getName()).add(tag.displayName); + } + + List branches = JGitUtils.getLocalBranches(repository, true, -1); + // TODO detect branch deletion + + // walk through each branch + for (RefModel branch : branches) { + // determine last commit + String branchName = branch.getName(); + String keyName = getBranchKey(branchName); + String lastCommit = config.getString(CONF_BRANCH, null, keyName); + + List revs; + if (StringUtils.isEmpty(lastCommit)) { + // new branch/unindexed branch, get all commits on branch + revs = JGitUtils.getRevLog(repository, branchName, 0, -1); + } else { + // pre-existing branch, get changes since last commit + revs = JGitUtils.getRevLog(repository, lastCommit, branchName); + } + + // reverse the list of commits so we start with the first commit + Collections.reverse(revs); + for (RevCommit commit : revs) { + index(repository, branchName, commit); + } + + // update the config + config.setInt(CONF_INDEX, null, CONF_VERSION, INDEX_VERSION); + config.setString(CONF_ALIAS, null, keyName, branchName); + config.setString(CONF_BRANCH, null, keyName, branch.getObjectId().getName()); + config.save(); + } + success = true; + } catch (Throwable t) { + t.printStackTrace(); + } + return success; + } + /** * Creates a Lucene document from an issue. 
* @@ -446,8 +576,7 @@ public class LuceneUtils { private static boolean index(Repository repository, Document doc) { try { String repositoryName = getName(repository); - doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, - Index.NOT_ANALYZED)); + doc.add(new Field(FIELD_REPOSITORY, repositoryName, Store.YES, Index.NOT_ANALYZED)); IndexWriter writer = getIndexWriter(repository, false); writer.addDocument(doc); resetIndexSearcher(repository); @@ -587,9 +716,9 @@ public class LuceneUtils { for (Repository repository : repositories) { IndexSearcher repositoryIndex = getIndexSearcher(repository); readers.add(repositoryIndex.getIndexReader()); - } - IndexReader [] rdrs = readers.toArray(new IndexReader[readers.size()]); - MultiReader reader = new MultiReader(rdrs); + } + IndexReader[] rdrs = readers.toArray(new IndexReader[readers.size()]); + MultiReader reader = new MultiReader(rdrs); searcher = new IndexSearcher(reader); } Query rewrittenQuery = searcher.rewrite(query); @@ -606,7 +735,7 @@ public class LuceneUtils { e.printStackTrace(); } return new ArrayList(results); - } + } /** * Close all the index writers and searchers diff --git a/tests/com/gitblit/tests/IssuesTest.java b/tests/com/gitblit/tests/IssuesTest.java index 1224a040..92952119 100644 --- a/tests/com/gitblit/tests/IssuesTest.java +++ b/tests/com/gitblit/tests/IssuesTest.java @@ -128,7 +128,7 @@ public class IssuesTest { // build a new Lucene index LuceneUtils.deleteIndex(repository); for (IssueModel anIssue : allIssues) { - LuceneUtils.index(repository, anIssue, false); + LuceneUtils.index(repository, anIssue); } List hits = LuceneUtils.search("working", 10, repository); assertTrue(hits.size() > 0); @@ -139,7 +139,7 @@ public class IssuesTest { change.comment("this is a test of reindexing an issue"); IssueUtils.updateIssue(repository, issue.id, change); issue = IssueUtils.getIssue(repository, issue.id); - LuceneUtils.index(repository, issue, true); + LuceneUtils.index(repository, issue); // 
delete all issues for (IssueModel anIssue : allIssues) { diff --git a/tests/com/gitblit/tests/LuceneUtilsTest.java b/tests/com/gitblit/tests/LuceneUtilsTest.java index 70756953..7f3dad0e 100644 --- a/tests/com/gitblit/tests/LuceneUtilsTest.java +++ b/tests/com/gitblit/tests/LuceneUtilsTest.java @@ -37,17 +37,17 @@ public class LuceneUtilsTest { public void testFullIndex() throws Exception { // reindex helloworld Repository repository = GitBlitSuite.getHelloworldRepository(); - LuceneUtils.index(repository); + LuceneUtils.reindex(repository); repository.close(); // reindex theoretical physics repository = GitBlitSuite.getTheoreticalPhysicsRepository(); - LuceneUtils.index(repository); + LuceneUtils.reindex(repository); repository.close(); // reindex JGit repository = GitBlitSuite.getJGitRepository(); - LuceneUtils.index(repository); + LuceneUtils.reindex(repository); repository.close(); LuceneUtils.close(); -- 2.39.5