Browse Source

Switched to streaming data for tika parsing

pull/1277/head
Mitja Lenič 6 years ago
parent
commit
1ae61f3ff8

+ 3
- 3
build.moxie View File

@@ -169,14 +169,14 @@ dependencies:
- compile 'com.unboundid:unboundid-ldapsdk:2.3.8' :war
- compile 'org.apache.ivy:ivy:2.2.0' :war
- compile 'com.toedter:jcalendar:1.3.2' :authority
- compile 'org.apache.commons:commons-compress:1.4.1' :war
- compile 'commons-io:commons-io:2.2' :war
- compile 'org.apache.commons:commons-compress:1.16' :war
- compile 'commons-io:commons-io:2.6' :war
- compile 'com.force.api:force-partner-api:24.0.0' :war
- compile 'org.freemarker:freemarker:2.3.22' :war
- compile 'com.github.dblock.waffle:waffle-jna:1.7.3' :war
- compile 'org.kohsuke:libpam4j:1.8' :war
- compile 'args4j:args4j:2.0.29' :war :fedclient
- compile 'commons-codec:commons-codec:1.7' :war
- compile 'commons-codec:commons-codec:1.11' :war
- compile 'redis.clients:jedis:2.6.2' :war
- compile 'ro.fortsoft.pf4j:pf4j:0.9.0' :war
- compile 'org.apache.tika:tika-core:1.17' :war

+ 45
- 35
src/main/java/com/gitblit/service/LuceneService.java View File

@@ -93,8 +93,13 @@ import com.gitblit.models.RepositoryModel;
import com.gitblit.models.SearchResult;
import com.gitblit.utils.ArrayUtils;
import com.gitblit.utils.JGitUtils;
import static com.gitblit.utils.JGitUtils.getDefaultBranch;
import com.gitblit.utils.StringUtils;
import java.io.ByteArrayInputStream;
import java.util.logging.Level;
import org.eclipse.jgit.lib.ObjectStream;
import org.eclipse.jgit.revwalk.RevBlob;
import org.eclipse.jgit.treewalk.filter.PathFilterGroup;
/**
* The Lucene service handles indexing and searching repositories.
@@ -552,41 +557,43 @@ public class LuceneService implements Runnable {
if (StringUtils.isEmpty(ext) || !excludedExtensions.contains(ext)) {
boolean useTika = useTika(ext);
ObjectLoader ldr = repository.open(blobId, Constants.OBJ_BLOB);
InputStream in = ldr.openStream();
int n;
while ((n = in.read(tmp)) > 0) {
os.write(tmp, 0, n);
}
in.close();
byte[] content = os.toByteArray();
String str;
if (useTika) {
str = TikaUtils.extractText(ext, name, content, this, path, new Indexer() {
@Override
public boolean index(String name, String content) {
try {
Document doc = new Document();
doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMIT, commitName, TextField.TYPE_STORED));
doc.add(new Field(FIELD_PATH, name, TextField.TYPE_STORED));
doc.add(new Field(FIELD_ARCHIVE, path, TextField.TYPE_STORED));
doc.add(new Field(FIELD_DATE, blobDate, StringField.TYPE_STORED));
doc.add(new Field(FIELD_AUTHOR, blobAuthor, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMITTER, blobCommitter, TextField.TYPE_STORED));
doc.add(new Field(FIELD_CONTENT, content, TextField.TYPE_STORED));
writer.addDocument(doc);
return true;
} catch (IOException ex) {
java.util.logging.Logger.getLogger(LuceneService.class.getName()).log(Level.SEVERE, null, ex);
return false;
try (InputStream is = ldr.openStream()) {
str = TikaUtils.extractText(ext, name, is, this, path, new Indexer() {
@Override
public boolean index(String name, String content) {
try {
Document doc = new Document();
doc.add(new Field(FIELD_OBJECT_TYPE, SearchObjectType.blob.name(), StringField.TYPE_STORED));
doc.add(new Field(FIELD_BRANCH, branchName, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMIT, commitName, TextField.TYPE_STORED));
doc.add(new Field(FIELD_PATH, name, TextField.TYPE_STORED));
doc.add(new Field(FIELD_ARCHIVE, path, TextField.TYPE_STORED));
doc.add(new Field(FIELD_DATE, blobDate, StringField.TYPE_STORED));
doc.add(new Field(FIELD_AUTHOR, blobAuthor, TextField.TYPE_STORED));
doc.add(new Field(FIELD_COMMITTER, blobCommitter, TextField.TYPE_STORED));
doc.add(new Field(FIELD_CONTENT, content, TextField.TYPE_STORED));
writer.addDocument(doc);
return true;
} catch (IOException ex) {
java.util.logging.Logger.getLogger(LuceneService.class.getName()).log(Level.SEVERE, null, ex);
return false;
}
}
}
});
});
}
} else {
InputStream in = ldr.openStream();
int n;
while ((n = in.read(tmp)) > 0) {
os.write(tmp, 0, n);
}
in.close();
byte[] content = os.toByteArray();
str = StringUtils.decodeString(content, encodings);
}
if (str!=null) {
if (str != null) {
doc.add(new Field(FIELD_CONTENT, str, TextField.TYPE_STORED));
}
os.reset();
@@ -644,7 +651,7 @@ public class LuceneService implements Runnable {
String[] encodings = storedSettings.getStrings(Keys.web.blobEncodings).toArray(new String[0]);
return StringUtils.decodeString(content, encodings);
}
/**
* Incrementally update the index with the specified commit for the
* repository.
@@ -697,9 +704,11 @@ public class LuceneService implements Runnable {
// read the blob content
String str;
if (useTika) {
byte[] content = JGitUtils.getByteContent(repository, commit.getTree(),
path.path, true);
str = TikaUtils.extractText(ext, name, content, this, spath, new Indexer() {
RevWalk rw = new RevWalk(repository);
RevBlob blob = rw.lookupBlob(ObjectId.fromString(path.objectId));
ObjectLoader ldr = repository.open(blob.getId(), Constants.OBJ_BLOB);
try (ObjectStream is = ldr.openStream()) {
str = TikaUtils.extractText(ext, name,is , this, spath, new Indexer() {
@Override
public boolean index(String name, String content) {
try {
@@ -721,7 +730,8 @@ public class LuceneService implements Runnable {
}
}
});
}
rw.dispose();
} else {
str = JGitUtils.getStringContent(repository, commit.getTree(),
path.path, encodings);
@@ -755,7 +765,7 @@ public class LuceneService implements Runnable {
}
/**
 * Reports whether content with the given file extension should be run through Tika.
 *
 * @param ext file extension to test; may be {@code null} when the path has none
 * @return {@code true} only when {@code tikaExtensions} is non-null, {@code ext}
 *         is non-null, and {@code tikaExtensions} contains {@code ext}
 */
protected boolean useTika(String ext) {
    // ext is checked for null before contains(): archive entries without a dot
    // reach this method with a null extension, and not every collection
    // implementation tolerates contains(null).
    return tikaExtensions != null && ext != null && tikaExtensions.contains(ext);
}
/**

+ 31
- 34
src/main/java/com/gitblit/service/TikaUtils.java View File

@@ -33,58 +33,55 @@ import org.apache.tika.exception.TikaException;
public class TikaUtils {
public static String extractText(String ext, String filename, byte[] data, LuceneService service, String path, LuceneService.Indexer indexer) {
public static String extractText(String ext, String filename, InputStream is, LuceneService service, String path, LuceneService.Indexer indexer) {
Tika tika = new Tika();
String fileType = tika.detect(filename);
try (InputStream is = new ByteArrayInputStream(data)) {
try {
Logger.getLogger(TikaUtils.class.getName()).info("Tika parsing " + filename);
if (isArchive(filename, ext)) {
return extractTextFromArchive(ext, filename, data, service,path, indexer);
return extractTextFromArchive(ext, filename, is, service, path, indexer);
}
return tika.parseToString(is);
} catch (IOException ex) {
Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
return "";
} catch (Throwable tex) {
Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, tex);
return "";
}
}
private static String extractTextFromArchive(String ext, String filename, byte[] data, LuceneService service, String path, LuceneService.Indexer indexer) {
Logger.getLogger(TikaUtils.class.getName()).info("Tika zip parsing " + filename + " " + data.length);
try (InputStream is = new ByteArrayInputStream(data)) {
try (ArchiveInputStream in = new ArchiveStreamFactory().createArchiveInputStream(ArchiveStreamFactory.ZIP, is)) {
ArchiveEntry nextEntry;
while ((nextEntry = in.getNextEntry()) != null) {
String archiveExt = null;
String name = nextEntry.getName().toLowerCase();
if (name.indexOf('.') > -1) {
archiveExt = name.substring(name.lastIndexOf('.') + 1);
}
name = filename + "/" + name;
Logger.getLogger(TikaUtils.class.getName()).info("Tika zip parsing " + name);
if (!nextEntry.isDirectory()) {
try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
IOUtils.copy(in, bos);
bos.flush();
String result = service.getEncodedString(bos.toByteArray(), archiveExt);
if (result == null && service.useTika(ext)) {
result = extractText(archiveExt, path+"/"+nextEntry.getName(), bos.toByteArray(), service, path+"/"+nextEntry.getName(), indexer);
}
if (result!=null) {
indexer.index(path+"/"+nextEntry.getName(), result);
Logger.getLogger(TikaUtils.class.getName()).info("Tika zip extract " + name + " " + result.length());
}
private static String extractTextFromArchive(String ext, String filename, InputStream is, LuceneService service, String path, LuceneService.Indexer indexer) {
Logger.getLogger(TikaUtils.class.getName()).info("Tika zip parsing " + filename + " ");
try (ArchiveInputStream in = new ArchiveStreamFactory().createArchiveInputStream(ArchiveStreamFactory.ZIP, is)) {
ArchiveEntry nextEntry;
while ((nextEntry = in.getNextEntry()) != null) {
String archiveExt = null;
String name = nextEntry.getName().toLowerCase();
if (name.indexOf('.') > -1) {
archiveExt = name.substring(name.lastIndexOf('.') + 1);
}
name = filename + "/" + name;
Logger.getLogger(TikaUtils.class.getName()).info("Tika zip parsing " + name);
if (!nextEntry.isDirectory()) {
try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
IOUtils.copy(in, bos);
bos.flush();
String result = service.getEncodedString(bos.toByteArray(), archiveExt);
if (result == null && service.useTika(ext)) {
result = extractText(archiveExt, path + "/" + nextEntry.getName(), new ByteArrayInputStream(bos.toByteArray()), service, path + "/" + nextEntry.getName(), indexer);
}
if (result != null) {
indexer.index(path + "/" + nextEntry.getName(), result);
Logger.getLogger(TikaUtils.class.getName()).info("Tika zip extract " + name + " " + result.length());
}
} catch (IOException ex) {
Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
}
}
} catch (ArchiveException ex) {
Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
}
} catch (IOException ex) {
Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
} catch (ArchiveException ex) {
Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
}
return null;
}

Loading…
Cancel
Save