--- a/components/indri/patches/remove_xpdf.patch Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-Remove xpdf support from the build.
---- indri-5.4/MakeDefns.in čt črc 4 15:01:17 2013
-+++ indri-5.4/MakeDefns.in čt črc 4 15:00:40 2013
-@@ -48,7 +48,7 @@
- PHPINCLUDE = @PHPINCLUDE@
- MCS=@MCS@
-
--DEPENDENCIES = lemur xpdf
-+DEPENDENCIES = lemur
- ifeq ($(NEED_ANTLR), 1)
- DEPENDENCIES += antlr
- endif
---- indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:46 2013
-+++ indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:28 2013
-@@ -1,214 +1,0 @@
--/*==========================================================================
-- * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
-- *
-- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-- * is subject to the terms of the software license set forth in the LICENSE
-- * file included with this software, and also available at
-- * http://www.lemurproject.org/license.html
-- *
-- *==========================================================================
--*/
--
--
--//
--// PDFDocumentExtractor
--//
--// 25 June 2004 -- tds
--//
--
--#include "indri/PDFDocumentExtractor.hpp"
--#include "indri/Buffer.hpp"
--
--#include "GString.h"
--#include "TextOutputDev.h"
--#include "PDFDoc.h"
--
--#include "Object.h"
--#include "Stream.h"
--#include "Array.h"
--#include "Dict.h"
--#include "XRef.h"
--#include "Page.h"
--#include "CharTypes.h"
--#include "GlobalParams.h"
--#include "lemur/Exception.hpp"
--
--static void buffer_write( void* stream, char* text, int len ) {
-- indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
--
-- if( buffer->position() ) {
-- buffer->unwrite(1);
-- }
--
-- memcpy( buffer->write(len), text, len );
-- if( text[len-1] != 0 )
-- *buffer->write(1) = 0;
--}
--
--indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
-- globalParams = new GlobalParams(0);
-- _title="";
-- _author="";
--}
--
--indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
-- delete globalParams;
-- globalParams = 0;
--}
--
--
--void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
-- if (node == NULL) {
-- return;
-- }
--
-- const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
-- for( size_t i=0; i<children.size(); i++ ) {
-- indri::xml::XMLNode* child = children[i];
-- metaTag = child->getValue();
-- if(metaTag.length()==0)
-- seekValue(child,metaTag);
-- else
-- return;
-- }
--
--}
--
--void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
-- indri::xml::XMLNode* current = 0;
--
-- if (node == NULL) {
-- return;
-- }
--
-- const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
--
-- for( size_t i=0; i<children.size(); i++ ) {
-- indri::xml::XMLNode* child = children[i];
-- std::string name = child->getName();
-- if(name=="dccreator")
-- {
-- seekValue(child,_author);
-- }
-- if(name=="dctitle")
-- {
-- seekValue(child,_title);
-- }
-- appendPdfMetaData(child);
--
-- }
--
--
--
--}
--
--void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
-- _documentTextBuffer.clear();
-- _documentPath = filename;
--}
--
--void indri::parse::PDFDocumentExtractor::close() {
-- _documentPath = "";
--}
--
--indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
-- if( !_documentPath.length() )
-- return 0;
--
-- PDFDoc* doc = 0;
-- TextOutputDev* textOut = 0;
-- GString* gfilename = new GString(_documentPath.c_str());
-- doc = new PDFDoc( gfilename );
-- // if the doc is not ok, or ok to copy, it
-- // will be a document of length 0.
-- if( doc->isOk() && doc->okToCopy() ) {
-- void* stream = &_documentTextBuffer;
-- textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
-- if ( textOut->isOk() ) {
-- int firstPage = 1;
-- int lastPage = doc->getNumPages();
-- double hDPI=72.0;
-- double vDPI=72.0;
-- int rotate=0;
-- GBool useMediaBox=gFalse;
-- GBool crop=gTrue;
-- GBool printing=gFalse;
-- if(doc->readMetadata()!=NULL)
-- {
-- GString rawMetaData = doc->readMetadata();
-- GString preparedMetaData="";
--
-- //zoek <rdf:RDF en eindig bij </rdf:RDF>!!
-- for(int x=0; x<rawMetaData.getLength(); x++) {
-- if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
-- //skip characters which the XMLReader doesn't understand
-- preparedMetaData.append(rawMetaData.getChar(x));
-- }
-- }
-- std::string metaData(preparedMetaData.getCString());
-- int startbegin = metaData.find("<rdf");
-- int stopend = metaData.find(">", metaData.rfind("</rdf") );
-- metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
--
--
-- indri::xml::XMLReader reader;
--
-- try {
-- std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
-- appendPdfMetaData( result.get() );
-- } catch( lemur::api::Exception& e ) {
-- LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
-- }
-- if( _author.length()>0 || _title.length()>0 )
-- {
-- std::string createdPdfHeader;
-- createdPdfHeader="<head>\n";
-- if(_title.length()>0) {
-- createdPdfHeader+="<title>";
-- createdPdfHeader+=_title;
-- createdPdfHeader+="</title>\n";
-- }
-- if(_author.length()>0) {
-- createdPdfHeader+="<author>";
-- createdPdfHeader+=_author;
-- createdPdfHeader+="</author>\n";
-- }
-- createdPdfHeader+="</head>\n";
-- char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
-- strcpy(metastream, createdPdfHeader.c_str());
-- }
-- }
-- doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
-- }
-- }
--
--
-- delete textOut;
-- delete doc;
--
-- _unparsedDocument.textLength = _documentTextBuffer.position();
-- _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
-- char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
-- strcpy( docnoPoint, _documentPath.c_str() );
-- _unparsedDocument.text = _documentTextBuffer.front();
-- _unparsedDocument.content = _documentTextBuffer.front();
-- _unparsedDocument.metadata.clear();
--
-- indri::parse::MetadataPair pair;
--
-- pair.key = "path";
-- pair.value = docnoPoint;
-- pair.valueLength = _documentPath.length()+1;
-- _unparsedDocument.metadata.push_back( pair );
--
-- _docnostring.assign(_documentPath.c_str() );
-- cleanDocno();
-- pair.value = _docnostring.c_str();
-- pair.valueLength = _docnostring.length()+1;
-- pair.key = "docno";
-- _unparsedDocument.metadata.push_back( pair );
--
-- _documentPath = "";
--
-- return &_unparsedDocument;
--}
---- indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:16:04 2013
-+++ indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:15:00 2013
-@@ -1,57 +1,0 @@
--/*==========================================================================
-- * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved.
-- *
-- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-- * is subject to the terms of the software license set forth in the LICENSE
-- * file included with this software, and also available at
-- * http://www.lemurproject.org/license.html
-- *
-- *==========================================================================
-- */
--
--
--//
--// PDFDocumentExtractor
--//
--// 25 June 2004 -- tds
--//
--
--#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
--#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
--
--#include "lemur/lemur-compat.hpp"
--#include "indri/Buffer.hpp"
--#include "indri/UnparsedDocument.hpp"
--#include "indri/DocumentIterator.hpp"
--#include "indri/XMLReader.hpp"
--#include "indri/XMLNode.hpp"
--#include "indri/XMLWriter.hpp"
--#include <string>
--namespace indri
--{
-- namespace parse
-- {
--
-- class PDFDocumentExtractor : public DocumentIterator {
-- indri::utility::Buffer _documentTextBuffer;
-- UnparsedDocument _unparsedDocument;
-- std::string _documentPath;
--
-- public:
-- PDFDocumentExtractor();
-- ~PDFDocumentExtractor();
--
-- void open( const std::string& filename );
-- UnparsedDocument* nextDocument();
-- void appendPdfMetaData(indri::xml::XMLNode* node);
-- void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
-- void close();
-- private:
-- std::string _title;
-- std::string _author;
--
-- };
-- }
--}
--
--#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
---- indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:24:24 2013
-+++ indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:23:27 2013
-@@ -18,7 +18,6 @@
-
- #include "indri/DocumentIteratorFactory.hpp"
-
--#include "indri/PDFDocumentExtractor.hpp"
- #include "indri/TaggedDocumentIterator.hpp"
- #include "indri/WARCDocumentIterator.hpp"
- #include "indri/TextDocumentExtractor.hpp"
-@@ -36,7 +35,6 @@
-
- #define TYPE_TAGGED ( "Tagged Document Collection" )
- #define TYPE_WARC ( "WARC Document Collection" )
--#define TYPE_PDF ( "Adobe PDF" )
- #define TYPE_WORD ( "Microsoft Word" )
- #define TYPE_PPT ( "Microsoft PowerPoint" )
- #define TYPE_MBOX ( "Mailbox" )
-@@ -53,8 +51,6 @@
- result = iter;
- } else if( preferred == TYPE_WARC ) {
- result = new indri::parse::WARCDocumentIterator();
-- } else if( preferred == TYPE_PDF ) {
-- result = new indri::parse::PDFDocumentExtractor();
- } else if( preferred == TYPE_TEXT ) {
- result = new indri::parse::TextDocumentExtractor();
- } else if( preferred == TYPE_MBOX ) {
-@@ -83,8 +79,6 @@
- return TYPE_TAGGED;
- } else if( type == "warc" || type == TYPE_WARC ) {
- return TYPE_WARC;
-- } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
-- return TYPE_PDF;
- } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
- return TYPE_WORD;
- } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
---- indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:56 2013
-+++ indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:20 2013
-@@ -55,8 +55,6 @@
- // case. Values specified here can be in mixed case, since values are
- // matched in a case-sensitive manner.
-
--static const char* pdf_index_tags[] = { "title", "author", 0 };
--static const char* pdf_metadata_tags[] = { "title", "author", 0 };
- static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
- static const char* html_metadata_tags[] = { "title", "author", 0 };
- //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
-@@ -279,21 +277,6 @@
- #endif
-
- {
-- "pdf", // name
-- "html", // parser
-- "word", // tokenizer
-- "pdf", // iterator
-- NULL, // startDocTag
-- NULL, // endDocTag
-- NULL, // endMetadataTag
-- NULL, // includeTags
-- NULL, // excludeTags
-- pdf_index_tags, // indexTags
-- pdf_metadata_tags, // metadataTags
-- NULL // conflations
-- },
--
-- {
- "txt", // name
- "text", // parser
- "word", // tokenizer