From d53eab3b3270799746259929d0256504e2e9a6a2 Mon Sep 17 00:00:00 2001 From: James William Dumay Date: Fri, 5 Dec 2008 06:35:48 +0000 Subject: [PATCH] Search now has good results. We removed the content indexing and actually included fields to search. Query now uses AND instead of the default OR for the QueryParser. Very googly. git-svn-id: https://svn.apache.org/repos/asf/archiva/branches/archiva-search-improvements@723612 13f79535-47bb-0310-9956-ffa450edef68 --- .../lucene/IndexContentConsumer.java | 6 --- .../filecontent/FileContentAnalyzer.java | 12 +++++ .../filecontent/FileContentConverter.java | 5 +-- .../filecontent/FileContentHandlers.java | 13 +++++- .../indexer/filecontent/FileContentKeys.java | 2 - .../filecontent/FileContentRecord.java | 12 ----- .../lucene/analyzers/ArtifactIdTokenizer.java | 45 +++++++++++++++++++ .../services/SearchServiceImplTest.java | 2 - 8 files changed, 69 insertions(+), 28 deletions(-) create mode 100644 archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/lucene/analyzers/ArtifactIdTokenizer.java diff --git a/archiva-modules/archiva-base/archiva-consumers/archiva-lucene-consumers/src/main/java/org/apache/maven/archiva/consumers/lucene/IndexContentConsumer.java b/archiva-modules/archiva-base/archiva-consumers/archiva-lucene-consumers/src/main/java/org/apache/maven/archiva/consumers/lucene/IndexContentConsumer.java index ea3908772..2c805f64c 100644 --- a/archiva-modules/archiva-base/archiva-consumers/archiva-lucene-consumers/src/main/java/org/apache/maven/archiva/consumers/lucene/IndexContentConsumer.java +++ b/archiva-modules/archiva-base/archiva-consumers/archiva-lucene-consumers/src/main/java/org/apache/maven/archiva/consumers/lucene/IndexContentConsumer.java @@ -159,10 +159,8 @@ public class IndexContentConsumer FileContentRecord record = new FileContentRecord(); try { - File file = new File( repositoryDir, path ); record.setRepositoryId( this.repository.getId() ); record.setFilename( path ); - record.setContents( FileUtils.readFileToString( file, null ) ); // Test for possible artifact reference syntax. try @@ -179,10 +177,6 @@ public class IndexContentConsumer index.modifyRecord( record ); } - catch ( IOException e ) - { - triggerConsumerError( READ_CONTENT, "Unable to read file contents: " + e.getMessage() ); - } catch ( RepositoryIndexException e ) { triggerConsumerError( INDEX_ERROR, "Unable to index file contents: " + e.getMessage() ); diff --git a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentAnalyzer.java b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentAnalyzer.java index 855d22591..21518080e 100644 --- a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentAnalyzer.java +++ b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentAnalyzer.java @@ -23,6 +23,8 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.maven.archiva.indexer.lucene.analyzers.FilenamesTokenizer; +import org.apache.maven.archiva.indexer.lucene.analyzers.ArtifactIdTokenizer; +import org.apache.maven.archiva.indexer.lucene.analyzers.GroupIdTokenizer; import java.io.Reader; @@ -42,6 +44,16 @@ public class FileContentAnalyzer extends Analyzer return new FilenamesTokenizer( reader ); } + if ( FileContentKeys.ARTIFACTID.equals( field )) + { + return new ArtifactIdTokenizer(reader); + } + + if ( FileContentKeys.GROUPID.equals( field ) ) + { + return new GroupIdTokenizer(reader); + } + return STANDARD.tokenStream( field, reader ); } } diff --git a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentConverter.java b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentConverter.java index ad191f673..68edf4555 100644 --- a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentConverter.java +++ b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentConverter.java @@ -37,7 +37,6 @@ import java.text.ParseException; public class FileContentConverter implements LuceneEntryConverter { - public Document convert( LuceneRepositoryContentRecord record ) { if ( !( record instanceof FileContentRecord ) ) @@ -62,9 +61,8 @@ public class FileContentConverter doc.addFieldTokenized( ArtifactKeys.TYPE, filecontent.getArtifact().getType() ); doc.addFieldUntokenized( ArtifactKeys.CLASSIFIER, filecontent.getArtifact().getClassifier() ); } - + doc.addFieldTokenized( FileContentKeys.FILENAME, filecontent.getFilename() ); - doc.addFieldTokenized( FileContentKeys.CONTENT, filecontent.getContents() ); return doc.getDocument(); } @@ -91,7 +89,6 @@ public class FileContentConverter // Filecontent Specifics record.setFilename( document.get( FileContentKeys.FILENAME ) ); - record.setContents( document.get( FileContentKeys.CONTENT ) ); return record; } diff --git a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentHandlers.java b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentHandlers.java index 70a95c9f4..f3058dda0 100644 --- a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentHandlers.java +++ b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentHandlers.java @@ -43,8 +43,17 @@ public class FileContentHandlers { analyzer = new FileContentAnalyzer(); converter = new FileContentConverter(); - queryParser = new MultiFieldQueryParser( new String[] { FileContentKeys.FILENAME, FileContentKeys.CONTENT }, - analyzer ); + queryParser = new MultiFieldQueryParser( new String[] { + FileContentKeys.FILENAME, + FileContentKeys.ARTIFACTID, + FileContentKeys.GROUPID, + FileContentKeys.ARTIFACTID_EXACT, + FileContentKeys.GROUPID_EXACT, + FileContentKeys.VERSION, + FileContentKeys.VERSION_EXACT}, + analyzer ); + //We prefer the narrowing approach to search results. + queryParser.setDefaultOperator(MultiFieldQueryParser.Operator.AND); } public String getId() diff --git a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentKeys.java b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentKeys.java index 1b9e6260c..343f359a3 100644 --- a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentKeys.java +++ b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentKeys.java @@ -32,6 +32,4 @@ public class FileContentKeys public static final String ID = "filecontent"; public static final String FILENAME = "filename"; - - public static final String CONTENT = "content"; } diff --git a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentRecord.java b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentRecord.java index 991f7b0a2..0a1221e1e 100644 --- a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentRecord.java +++ b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/filecontent/FileContentRecord.java @@ -39,8 +39,6 @@ public class FileContentRecord */ private ArchivaArtifact artifact; - private String contents; - public String getRepositoryId() { return repositoryId; @@ -51,16 +49,6 @@ public class FileContentRecord this.repositoryId = repositoryId; } - public String getContents() - { - return contents; - } - - public void setContents( String contents ) - { - this.contents = contents; - } - public String getPrimaryKey() { return repositoryId + ":" + filename; diff --git a/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/lucene/analyzers/ArtifactIdTokenizer.java b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/lucene/analyzers/ArtifactIdTokenizer.java new file mode 100644 index 000000000..2e99c2663 --- /dev/null +++ b/archiva-modules/archiva-base/archiva-indexer/src/main/java/org/apache/maven/archiva/indexer/lucene/analyzers/ArtifactIdTokenizer.java @@ -0,0 +1,45 @@ +package org.apache.maven.archiva.indexer.lucene.analyzers; + +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +import java.io.Reader; +import org.apache.lucene.analysis.CharTokenizer; + +/** + * Lucene Tokenizer for {@link ArtifactKeys#ARTIFACTID} fields. + */ +public class ArtifactIdTokenizer extends CharTokenizer +{ + public ArtifactIdTokenizer( Reader reader ) + { + super( reader ); + } + + /** + * Break on "-" for "atlassian-plugins-core" + * @param c + * @return + */ + @Override + protected boolean isTokenChar(char c) + { + return (c != '-'); + } +} diff --git a/archiva-modules/archiva-web/archiva-xmlrpc/archiva-xmlrpc-services/src/test/java/org/apache/archiva/web/xmlrpc/services/SearchServiceImplTest.java b/archiva-modules/archiva-web/archiva-xmlrpc/archiva-xmlrpc-services/src/test/java/org/apache/archiva/web/xmlrpc/services/SearchServiceImplTest.java index 2deb6f1c2..2db1b931c 100644 --- a/archiva-modules/archiva-web/archiva-xmlrpc/archiva-xmlrpc-services/src/test/java/org/apache/archiva/web/xmlrpc/services/SearchServiceImplTest.java +++ b/archiva-modules/archiva-web/archiva-xmlrpc/archiva-xmlrpc-services/src/test/java/org/apache/archiva/web/xmlrpc/services/SearchServiceImplTest.java @@ -152,7 +152,6 @@ public class SearchServiceImplTest FileContentRecord record = new FileContentRecord(); record.setRepositoryId( "repo1.mirror" ); record.setArtifact( artifact ); - record.setContents( "org.apache.archiva:archiva-test:1.0:jar org.apache.archiva.test.MyClassName" ); record.setFilename( "archiva-test-1.0.jar" ); results.addHit( record ); @@ -198,7 +197,6 @@ public class SearchServiceImplTest FileContentRecord record = new FileContentRecord(); record.setRepositoryId( "repo1.mirror" ); record.setArtifact( artifact ); - record.setContents( "org.apache.archiva:archiva-test:1.0:jar" ); record.setFilename( "archiva-test-1.0.jar" ); results.addHit( record ); -- 2.39.5