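Remove xpdf support from the build.

This drops xpdf from DEPENDENCIES in MakeDefns.in, deletes
PDFDocumentExtractor.cpp and PDFDocumentExtractor.hpp, removes the
"Adobe PDF" iterator type from DocumentIteratorFactory.cpp, and removes
the "pdf" file class from FileClassEnvironmentFactory.cpp.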
--- indri-5.4/MakeDefns.in čt črc 4 15:01:17 2013
+++ indri-5.4/MakeDefns.in čt črc 4 15:00:40 2013
@@ -48,7 +48,7 @@
PHPINCLUDE = @PHPINCLUDE@
MCS=@MCS@
-DEPENDENCIES = lemur xpdf
+DEPENDENCIES = lemur
ifeq ($(NEED_ANTLR), 1)
DEPENDENCIES += antlr
endif
--- indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:46 2013
+++ indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:28 2013
@@ -1,214 +1,0 @@
-/*==========================================================================
- * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
- *
- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
- * is subject to the terms of the software license set forth in the LICENSE
- * file included with this software, and also available at
- * http://www.lemurproject.org/license.html
- *
- *==========================================================================
-*/
-
-
-//
-// PDFDocumentExtractor
-//
-// 25 June 2004 -- tds
-//
-
-#include "indri/PDFDocumentExtractor.hpp"
-#include "indri/Buffer.hpp"
-
-#include "GString.h"
-#include "TextOutputDev.h"
-#include "PDFDoc.h"
-
-#include "Object.h"
-#include "Stream.h"
-#include "Array.h"
-#include "Dict.h"
-#include "XRef.h"
-#include "Page.h"
-#include "CharTypes.h"
-#include "GlobalParams.h"
-#include "lemur/Exception.hpp"
-
-static void buffer_write( void* stream, char* text, int len ) {
- indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
-
- if( buffer->position() ) {
- buffer->unwrite(1);
- }
-
- memcpy( buffer->write(len), text, len );
- if( text[len-1] != 0 )
- *buffer->write(1) = 0;
-}
-
-indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
- globalParams = new GlobalParams(0);
- _title="";
- _author="";
-}
-
-indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
- delete globalParams;
- globalParams = 0;
-}
-
-
-void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
- if (node == NULL) {
- return;
- }
-
- const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
- for( size_t i=0; i<children.size(); i++ ) {
- indri::xml::XMLNode* child = children[i];
- metaTag = child->getValue();
- if(metaTag.length()==0)
- seekValue(child,metaTag);
- else
- return;
- }
-
-}
-
-void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
- indri::xml::XMLNode* current = 0;
-
- if (node == NULL) {
- return;
- }
-
- const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
-
- for( size_t i=0; i<children.size(); i++ ) {
- indri::xml::XMLNode* child = children[i];
- std::string name = child->getName();
- if(name=="dccreator")
- {
- seekValue(child,_author);
- }
- if(name=="dctitle")
- {
- seekValue(child,_title);
- }
- appendPdfMetaData(child);
-
- }
-
-
-
-}
-
-void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
- _documentTextBuffer.clear();
- _documentPath = filename;
-}
-
-void indri::parse::PDFDocumentExtractor::close() {
- _documentPath = "";
-}
-
-indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
- if( !_documentPath.length() )
- return 0;
-
- PDFDoc* doc = 0;
- TextOutputDev* textOut = 0;
- GString* gfilename = new GString(_documentPath.c_str());
- doc = new PDFDoc( gfilename );
- // if the doc is not ok, or ok to copy, it
- // will be a document of length 0.
- if( doc->isOk() && doc->okToCopy() ) {
- void* stream = &_documentTextBuffer;
- textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
- if ( textOut->isOk() ) {
- int firstPage = 1;
- int lastPage = doc->getNumPages();
- double hDPI=72.0;
- double vDPI=72.0;
- int rotate=0;
- GBool useMediaBox=gFalse;
- GBool crop=gTrue;
- GBool printing=gFalse;
- if(doc->readMetadata()!=NULL)
- {
- GString rawMetaData = doc->readMetadata();
- GString preparedMetaData="";
-
- //zoek <rdf:RDF en eindig bij </rdf:RDF>!!
- for(int x=0; x<rawMetaData.getLength(); x++) {
- if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
- //skip characters which the XMLReader doesn't understand
- preparedMetaData.append(rawMetaData.getChar(x));
- }
- }
- std::string metaData(preparedMetaData.getCString());
- int startbegin = metaData.find("<rdf");
- int stopend = metaData.find(">", metaData.rfind("</rdf") );
- metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
-
-
- indri::xml::XMLReader reader;
-
- try {
- std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
- appendPdfMetaData( result.get() );
- } catch( lemur::api::Exception& e ) {
- LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
- }
- if( _author.length()>0 || _title.length()>0 )
- {
- std::string createdPdfHeader;
- createdPdfHeader="<head>\n";
- if(_title.length()>0) {
- createdPdfHeader+="<title>";
- createdPdfHeader+=_title;
- createdPdfHeader+="</title>\n";
- }
- if(_author.length()>0) {
- createdPdfHeader+="<author>";
- createdPdfHeader+=_author;
- createdPdfHeader+="</author>\n";
- }
- createdPdfHeader+="</head>\n";
- char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
- strcpy(metastream, createdPdfHeader.c_str());
- }
- }
- doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
- }
- }
-
-
- delete textOut;
- delete doc;
-
- _unparsedDocument.textLength = _documentTextBuffer.position();
- _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
- char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
- strcpy( docnoPoint, _documentPath.c_str() );
- _unparsedDocument.text = _documentTextBuffer.front();
- _unparsedDocument.content = _documentTextBuffer.front();
- _unparsedDocument.metadata.clear();
-
- indri::parse::MetadataPair pair;
-
- pair.key = "path";
- pair.value = docnoPoint;
- pair.valueLength = _documentPath.length()+1;
- _unparsedDocument.metadata.push_back( pair );
-
- _docnostring.assign(_documentPath.c_str() );
- cleanDocno();
- pair.value = _docnostring.c_str();
- pair.valueLength = _docnostring.length()+1;
- pair.key = "docno";
- _unparsedDocument.metadata.push_back( pair );
-
- _documentPath = "";
-
- return &_unparsedDocument;
-}
--- indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:16:04 2013
+++ indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:15:00 2013
@@ -1,57 +1,0 @@
-/*==========================================================================
- * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
- *
- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
- * is subject to the terms of the software license set forth in the LICENSE
- * file included with this software, and also available at
- * http://www.lemurproject.org/license.html
- *
- *==========================================================================
- */
-
-
-//
-// PDFDocumentExtractor
-//
-// 25 June 2004 -- tds
-//
-
-#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
-#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
-
-#include "lemur/lemur-compat.hpp"
-#include "indri/Buffer.hpp"
-#include "indri/UnparsedDocument.hpp"
-#include "indri/DocumentIterator.hpp"
-#include "indri/XMLReader.hpp"
-#include "indri/XMLNode.hpp"
-#include "indri/XMLWriter.hpp"
-#include <string>
-namespace indri
-{
- namespace parse
- {
-
- class PDFDocumentExtractor : public DocumentIterator {
- indri::utility::Buffer _documentTextBuffer;
- UnparsedDocument _unparsedDocument;
- std::string _documentPath;
-
- public:
- PDFDocumentExtractor();
- ~PDFDocumentExtractor();
-
- void open( const std::string& filename );
- UnparsedDocument* nextDocument();
- void appendPdfMetaData(indri::xml::XMLNode* node);
- void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
- void close();
- private:
- std::string _title;
- std::string _author;
-
- };
- }
-}
-
-#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
--- indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:24:24 2013
+++ indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:23:27 2013
@@ -18,7 +18,6 @@
#include "indri/DocumentIteratorFactory.hpp"
-#include "indri/PDFDocumentExtractor.hpp"
#include "indri/TaggedDocumentIterator.hpp"
#include "indri/WARCDocumentIterator.hpp"
#include "indri/TextDocumentExtractor.hpp"
@@ -36,7 +35,6 @@
#define TYPE_TAGGED ( "Tagged Document Collection" )
#define TYPE_WARC ( "WARC Document Collection" )
-#define TYPE_PDF ( "Adobe PDF" )
#define TYPE_WORD ( "Microsoft Word" )
#define TYPE_PPT ( "Microsoft PowerPoint" )
#define TYPE_MBOX ( "Mailbox" )
@@ -53,8 +51,6 @@
result = iter;
} else if( preferred == TYPE_WARC ) {
result = new indri::parse::WARCDocumentIterator();
- } else if( preferred == TYPE_PDF ) {
- result = new indri::parse::PDFDocumentExtractor();
} else if( preferred == TYPE_TEXT ) {
result = new indri::parse::TextDocumentExtractor();
} else if( preferred == TYPE_MBOX ) {
@@ -83,8 +79,6 @@
return TYPE_TAGGED;
} else if( type == "warc" || type == TYPE_WARC ) {
return TYPE_WARC;
- } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
- return TYPE_PDF;
} else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
return TYPE_WORD;
} else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
--- indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:56 2013
+++ indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:20 2013
@@ -55,8 +55,6 @@
// case. Values specified here can be in mixed case, since values are
// matched in a case-sensitive manner.
-static const char* pdf_index_tags[] = { "title", "author", 0 };
-static const char* pdf_metadata_tags[] = { "title", "author", 0 };
static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
static const char* html_metadata_tags[] = { "title", "author", 0 };
//static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
@@ -279,21 +277,6 @@
#endif
{
- "pdf", // name
- "html", // parser
- "word", // tokenizer
- "pdf", // iterator
- NULL, // startDocTag
- NULL, // endDocTag
- NULL, // endMetadataTag
- NULL, // includeTags
- NULL, // excludeTags
- pdf_index_tags, // indexTags
- pdf_metadata_tags, // metadataTags
- NULL // conflations
- },
-
- {
"txt", // name
"text", // parser
"word", // tokenizer