1 package org.apache.maven.repository.indexing;
4 * Copyright 2005-2006 The Apache Software Foundation.
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 import org.apache.lucene.analysis.Analyzer;
20 import org.apache.lucene.analysis.CharTokenizer;
21 import org.apache.lucene.analysis.SimpleAnalyzer;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.index.IndexReader;
24 import org.apache.lucene.index.IndexWriter;
25 import org.apache.lucene.index.Term;
26 import org.apache.maven.artifact.repository.ArtifactRepository;
29 import java.io.IOException;
30 import java.io.Reader;
31 import java.util.Collection;
32 import java.util.zip.ZipEntry;
35 * Abstract class for RepositoryIndexers.
37 * @author Edwin Punzalan
38 * @todo [BP] overall am not happy with the design of this class and subclasses, but will refactor over time based on how it is used and by assessing how this affects Lucene's performance
40 public abstract class AbstractRepositoryIndex
41 implements RepositoryIndex
43 // TODO: can this be derived from the repository? -- probably a sensible default, but still should be configurable, but this could just be on the call to open()
44 private File indexPath;
46 private boolean indexOpen;
48 // TODO: why is the writer open for the life, but not the reader? why keep them open that length of time anyway? investigate best practices in Lucene
49 private IndexWriter indexWriter;
51 protected ArtifactRepository repository;
53 private Analyzer analyzer;
61 protected AbstractRepositoryIndex( File indexPath, ArtifactRepository repository )
63 this.repository = repository;
64 this.indexPath = indexPath;
68 * Method to open the IndexWriter
70 * @throws RepositoryIndexException
73 throws RepositoryIndexException
79 indexWriter = new IndexWriter( indexPath, getAnalyzer(), false );
83 indexWriter = new IndexWriter( indexPath, getAnalyzer(), true );
86 catch ( IOException ie )
88 throw new RepositoryIndexException( ie );
94 * @see org.apache.maven.repository.indexing.RepositoryIndex#optimize()
96 public void optimize()
97 throws RepositoryIndexException
101 throw new RepositoryIndexException( "Unable to optimize index on a closed index" );
106 indexWriter.optimize();
108 catch ( IOException ioe )
110 throw new RepositoryIndexException( "Failed to optimize index", ioe );
115 * @see org.apache.maven.repository.indexing.RepositoryIndex#isOpen()
117 public boolean isOpen()
123 * @see org.apache.maven.repository.indexing.RepositoryIndex#close()
126 throws RepositoryIndexException
130 if ( indexWriter != null )
138 catch ( IOException e )
140 throw new RepositoryIndexException( e.getMessage(), e );
145 * @see org.apache.maven.repository.indexing.RepositoryIndex#getIndexPath()
147 public File getIndexPath()
153 * Method to retrieve the lucene IndexWriter used in creating/updating the index
155 * @return the lucene IndexWriter object used to update the index
156 * @throws IOException
158 protected IndexWriter getIndexWriter()
161 // TODO: why is this allowed to be called before open()?
162 if ( indexWriter == null )
164 indexWriter = new IndexWriter( indexPath, getAnalyzer(), false );
170 * method for validating an index directory
173 * @throws RepositoryIndexException if the given indexPath is not valid for this type of RepositoryIndex
175 protected void validateIndex( String[] indexFields )
176 throws RepositoryIndexException, IOException
178 IndexReader indexReader = IndexReader.open( indexPath );
181 if ( indexReader.numDocs() > 0 )
183 Collection fields = indexReader.getFieldNames();
184 for ( int idx = 0; idx < indexFields.length; idx++ )
186 if ( !fields.contains( indexFields[idx] ) )
188 throw new RepositoryIndexException(
189 "The Field " + indexFields[idx] + " does not exist in index " + indexPath + "." );
201 * @see org.apache.maven.repository.indexing.RepositoryIndex#getRepository()
203 public ArtifactRepository getRepository()
209 * Delete the document(s) that contain the specified value in the specified field.
213 * @throws RepositoryIndexException
214 * @throws IOException
216 protected void deleteDocument( String field, String value )
217 throws RepositoryIndexException, IOException
219 IndexReader indexReader = null;
222 indexReader = IndexReader.open( indexPath );
223 indexReader.delete( new Term( field, value ) );
225 catch ( IOException ie )
227 throw new RepositoryIndexException( indexPath + " is not a valid directory." );
231 if ( indexReader != null )
239 * Check if the index already exists.
241 * @return true if the index already exists
242 * @throws RepositoryIndexException
244 protected boolean indexExists()
245 throws RepositoryIndexException
247 if ( IndexReader.indexExists( indexPath ) )
251 else if ( !indexPath.exists() )
255 else if ( indexPath.isDirectory() )
257 if ( indexPath.listFiles().length > 1 )
259 throw new RepositoryIndexException( indexPath + " is not a valid index directory." );
268 throw new RepositoryIndexException( indexPath + " is not a directory." );
273 * Checks if the object has already been indexed and deletes it if it is.
275 * @param object the object to be indexed.
276 * @throws RepositoryIndexException
277 * @throws IOException
279 abstract void deleteIfIndexed( Object object )
280 throws RepositoryIndexException, IOException;
283 * @see org.apache.maven.repository.indexing.RepositoryIndex#getAnalyzer()
285 public Analyzer getAnalyzer()
287 if ( analyzer == null )
289 analyzer = new ArtifactRepositoryIndexAnalyzer( new SimpleAnalyzer() );
296 * @see RepositoryIndex#isKeywordField(String)
298 public boolean isKeywordField( String field )
300 return KEYWORD_FIELDS.contains( field );
304 * Tests whether a zip entry is a Java class and, if so, adds its class name to the classes buffer
306 * @param entry the zip entry to test for java class
307 * @param classes the String buffer to which the java class is added if the test result is true
308 * @return true if the zip entry is a java class and was successfully added to the buffer
310 protected boolean addIfClassEntry( ZipEntry entry, StringBuffer classes )
312 boolean isAdded = false;
314 String name = entry.getName();
315 if ( name.endsWith( ".class" ) )
317 // TODO verify if class is public or protected
318 if ( name.lastIndexOf( "$" ) == -1 )
320 int idx = name.lastIndexOf( '/' );
325 String classname = name.substring( idx + 1, name.length() - 6 );
326 classes.append( classname ).append( "\n" );
334 private static class ArtifactRepositoryIndexAnalyzer
337 private Analyzer defaultAnalyzer;
340 * Constructor for this analyzer
342 * @param defaultAnalyzer the analyzer to use as default for the general fields of the artifact indices
344 ArtifactRepositoryIndexAnalyzer( Analyzer defaultAnalyzer )
346 this.defaultAnalyzer = defaultAnalyzer;
350 * Method called by Lucene during indexing operations
352 * @param fieldName the field name that the lucene object is currently processing
353 * @param reader a Reader object to the index stream
354 * @return a token stream specific to the field name, or the default analyzer's token stream if the field has no specific tokenizer
356 public TokenStream tokenStream( String fieldName, Reader reader )
358 TokenStream tokenStream;
360 if ( RepositoryIndex.FLD_VERSION.equals( fieldName ) || RepositoryIndex.FLD_LASTUPDATE.equals( fieldName ) )
362 tokenStream = new VersionTokenizer( reader );
366 tokenStream = defaultAnalyzer.tokenStream( fieldName, reader );
374 * Class used to tokenize an artifact's version.
376 private static class VersionTokenizer
377 extends CharTokenizer
380 * Constructor with the required reader to the index stream
382 * @param reader the Reader object of the index stream
384 VersionTokenizer( Reader reader )
390 * Method that Lucene calls to check whether a character of the stream belongs to a token
392 * @param character char currently being processed
393 * @return true if the char is part of a token, false if the char is a separator ('.' or '-')
395 protected boolean isTokenChar( char character )
397 return character != '.' && character != '-';