You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

TikaUtils.java 4.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. /*
  2. * Copyright 2012 gitblit.com.
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. package com.gitblit.service;
  17. import java.io.ByteArrayInputStream;
  18. import java.io.ByteArrayOutputStream;
  19. import java.io.IOException;
  20. import java.io.InputStream;
  21. import java.util.Set;
  22. import java.util.logging.Level;
  23. import java.util.logging.Logger;
  24. import org.apache.commons.compress.archivers.ArchiveEntry;
  25. import org.apache.commons.compress.archivers.ArchiveException;
  26. import org.apache.commons.compress.archivers.ArchiveInputStream;
  27. import org.apache.commons.compress.archivers.ArchiveStreamFactory;
  28. import org.apache.commons.compress.archivers.zip.ZipUtil;
  29. import org.apache.commons.io.IOUtils;
  30. import org.apache.tika.Tika;
  31. import org.apache.tika.exception.TikaException;
  32. public class TikaUtils {
  33. public static String extractText(String ext, String filename, InputStream is, LuceneService service, String path, LuceneService.Indexer indexer) {
  34. Tika tika = new Tika();
  35. String fileType = tika.detect(filename);
  36. try {
  37. Logger.getLogger(TikaUtils.class.getName()).info("Tika parsing " + filename);
  38. if (isArchive(filename, ext)) {
  39. return extractTextFromArchive(ext, filename, is, service, path, indexer);
  40. }
  41. return tika.parseToString(is);
  42. } catch (Throwable tex) {
  43. Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, tex);
  44. return "";
  45. }
  46. }
  47. private static String extractTextFromArchive(String ext, String filename, InputStream is, LuceneService service, String path, LuceneService.Indexer indexer) {
  48. Logger.getLogger(TikaUtils.class.getName()).info("Tika zip parsing " + filename + " ");
  49. try (ArchiveInputStream in = new ArchiveStreamFactory().createArchiveInputStream(ArchiveStreamFactory.ZIP, is)) {
  50. ArchiveEntry nextEntry;
  51. while ((nextEntry = in.getNextEntry()) != null) {
  52. String archiveExt = null;
  53. String name = nextEntry.getName().toLowerCase();
  54. if (name.indexOf('.') > -1) {
  55. archiveExt = name.substring(name.lastIndexOf('.') + 1);
  56. }
  57. name = filename + "/" + name;
  58. Logger.getLogger(TikaUtils.class.getName()).info("Tika zip parsing " + name);
  59. if (!nextEntry.isDirectory()) {
  60. try (ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
  61. IOUtils.copy(in, bos);
  62. bos.flush();
  63. String result = service.getEncodedString(bos.toByteArray(), archiveExt);
  64. if (result == null && service.useTika(ext)) {
  65. result = extractText(archiveExt, path + "/" + nextEntry.getName(), new ByteArrayInputStream(bos.toByteArray()), service, path + "/" + nextEntry.getName(), indexer);
  66. }
  67. if (result != null) {
  68. indexer.index(path + "/" + nextEntry.getName(), result);
  69. Logger.getLogger(TikaUtils.class.getName()).info("Tika zip extract " + name + " " + result.length());
  70. }
  71. } catch (IOException ex) {
  72. Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
  73. }
  74. }
  75. }
  76. } catch (IOException ex) {
  77. Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
  78. } catch (ArchiveException ex) {
  79. Logger.getLogger(TikaUtils.class.getName()).log(Level.SEVERE, null, ex);
  80. }
  81. return null;
  82. }
  83. private static boolean isArchive(String filename, String ext) {
  84. return "zip".equals(ext);
  85. }
  86. }