/************************************************************************** /* A factory for creating Lucene Documents from pdf-files. /* /* This code is derived from code contained in PDFBox. See the file /* COPYING.PDFBox in the toplevel-directory of the distribution for /* copyright information. /* /* Copyright (c) 2003-2004 by Bernhard Bablok (mail@bablokb.de) /* /* This library is free software; you can redistribute it and/or modify /* it under the terms of the GNU Lesser General Public License as published /* by the Free Software Foundation; either version 2 of the License or /* (at your option) any later version. /* /* This library is distributed in the hope that it will be useful, but /* WITHOUT ANY WARRANTY; without even the implied warranty of /* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the /* GNU Lesser General Public License for more details. /* /* You should have received a copy of the GNU Lesser General Public License /* along with this library; see the file COPYING.LESSER. If not, write to /* the Free Software Foundation Inc., 59 Temple Place - Suite 330, /* Boston, MA 02111-1307 USA /**************************************************************************/ package de.bablokb.luala.lib; import java.io.*; import java.util.*; import org.apache.lucene.document.*; import org.pdfbox.pdfparser.*; import org.pdfbox.pdmodel.*; import org.pdfbox.encryption.*; import org.pdfbox.exceptions.*; import org.pdfbox.util.*; /** A factory for creating Lucene Documents from pdf-files. This factory uses code from PDFBox. @version $Revision: 1.2 $ @author $Author: bablokb $ */ public class PDFDocumentFactory extends StandardDocumentFactory { //////////////////////////////////////////////////////////////////////////// /** Create a Lucene-Document with the extracted pdf-text as a TEXT-field. @param name The name of the source-document @return A {@link org.apache.lucene.document.Document} @throws FactoryException */ public Document createDocument(String name) throws FactoryException { PDDocument pdfDoc = null; try { Document doc = super.createDocument(name); pdfDoc = getPDFDocument(name); addFields(doc,pdfDoc); Reader reader = getReader(pdfDoc); addSummary(doc,reader); addContent(doc,reader); if(pdfDoc != null) pdfDoc.close(); return doc; } catch (Exception e) { try { if(pdfDoc != null) pdfDoc.close(); } catch (Exception e2) { } throw new FactoryException(e); } } //////////////////////////////////////////////////////////////////////////// /** Set the type-field of this document. @param doc The document */ public void setType(Document doc) { doc.add(Field.UnIndexed(TYPE,"application/pdf")); } //////////////////////////////////////////////////////////////////////////// /** Create a PDDocument from the source-document. @param name The name of the source-document */ private PDDocument getPDFDocument(String name) throws Exception { FileInputStream is = new FileInputStream(name); PDDocument pdfDocument = null; InputStreamReader reader; try { PDFParser parser = new PDFParser(is); parser.parse(); pdfDocument = parser.getPDDocument(); if(pdfDocument.isEncrypted()) { DecryptDocument decryptor = new DecryptDocument(pdfDocument); //Just try using the default password and move on decryptor.decryptDocument(""); } } catch (Exception e) { } return pdfDocument; } //////////////////////////////////////////////////////////////////////////// /** Get a reader for the text-content of the PDF-document. @param pdfDoc The PDDocument */ private Reader getReader(PDDocument pdfDoc) throws IOException { ByteArrayOutputStream stream = new ByteArrayOutputStream(); OutputStreamWriter writer = new OutputStreamWriter(stream); PDFTextStripper stripper = new PDFTextStripper(); stripper.writeText(pdfDoc.getDocument(),writer); writer.close(); char[] contents = stream.toString().toCharArray(); stream.close(); return new CharArrayReader(contents); } //////////////////////////////////////////////////////////////////////////// /** Add additional (PDF-specific) fields to the document. @param doc The Lucene-document @param pdfDoc The-PDDocument */ private void addFields(Document doc,PDDocument pdfDoc) throws Exception { PDDocumentInformation info = pdfDoc.getDocumentInformation(); if(info.getAuthor() != null) doc.add(Field.Text(AUTHOR,info.getAuthor())); if(info.getCreationDate() != null) { Date date = info.getCreationDate().getTime(); if(date.getTime() >= 0) doc.add(Field.Text(DATE_CREATED,DateField.dateToString(date))); } if(info.getCreator() != null) doc.add(Field.Text("Creator",info.getCreator())); if(info.getKeywords() != null) doc.add(Field.Text(KEYWORDS,info.getKeywords())); if(info.getModificationDate() != null) { Date date = info.getModificationDate().getTime(); if(date.getTime() >= 0) doc.add(Field.Text(DATE_MOD,DateField.dateToString(date))); } if(info.getProducer() != null) doc.add(Field.Text("Producer",info.getProducer())); if(info.getSubject() != null) doc.add(Field.Text(SUBJECT,info.getSubject())); if(info.getTitle() != null) doc.add(Field.Text(TITLE,info.getTitle())); if(info.getTrapped() != null) doc.add(Field.Text("Trapped",info.getTrapped())); } }