components/indri/patches/remove_xpdf.patch
changeset 5215 7fe3e5b43e6a
parent 5214 f6336578f3e5
child 5216 fdd262355907
--- a/components/indri/patches/remove_xpdf.patch	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-Remove xpdf support from the build.
---- indri-5.4/MakeDefns.in	čt črc  4 15:01:17 2013
-+++ indri-5.4/MakeDefns.in	čt črc  4 15:00:40 2013
-@@ -48,7 +48,7 @@
- PHPINCLUDE = @PHPINCLUDE@
- MCS=@MCS@
- 
--DEPENDENCIES = lemur xpdf
-+DEPENDENCIES = lemur
- ifeq ($(NEED_ANTLR), 1)
-   DEPENDENCIES += antlr
- endif
---- indri-5.4/src/PDFDocumentExtractor.cpp	čt črc  4 15:08:46 2013
-+++ indri-5.4/src/PDFDocumentExtractor.cpp	čt črc  4 15:08:28 2013
-@@ -1,214 +1,0 @@
--/*==========================================================================
-- * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
-- *
-- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-- * is subject to the terms of the software license set forth in the LICENSE
-- * file included with this software, and also available at
-- * http://www.lemurproject.org/license.html
-- *
-- *==========================================================================
--*/
--
--
--//
--// PDFDocumentExtractor
--//
--// 25 June 2004 -- tds
--//
--
--#include "indri/PDFDocumentExtractor.hpp"
--#include "indri/Buffer.hpp"
--
--#include "GString.h"
--#include "TextOutputDev.h"
--#include "PDFDoc.h"
--
--#include "Object.h"
--#include "Stream.h"
--#include "Array.h"
--#include "Dict.h"
--#include "XRef.h"
--#include "Page.h"
--#include "CharTypes.h"
--#include "GlobalParams.h"
--#include "lemur/Exception.hpp"
--
--static void buffer_write( void* stream, char* text, int len ) {
--  indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
--
--  if( buffer->position() ) {
--    buffer->unwrite(1);
--  }
--
--  memcpy( buffer->write(len), text, len );
--  if( text[len-1] != 0 )
--    *buffer->write(1) = 0;
--}
--
--indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
--  globalParams = new GlobalParams(0);
--  _title="";
--  _author="";
--}
--
--indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
--  delete globalParams;
--  globalParams = 0;
--}
--
--
--void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
--  if (node == NULL) {
--    return;
--  }
--
--  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
--  for( size_t i=0; i<children.size(); i++ ) {
--    indri::xml::XMLNode* child = children[i];
--    metaTag = child->getValue();
--	if(metaTag.length()==0)
--		seekValue(child,metaTag);
--	else
--		return;
--  }
--
--}
--
--void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
--  indri::xml::XMLNode* current = 0;
--
--  if (node == NULL) {
--    return;
--  }
--
--  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
--
--  for( size_t i=0; i<children.size(); i++ ) {
--    indri::xml::XMLNode* child = children[i];
--    std::string name = child->getName();
--	if(name=="dccreator")
--	{
--		seekValue(child,_author);
--	}
--	if(name=="dctitle")
--	{
--		seekValue(child,_title);
--	}
--	appendPdfMetaData(child);
--
--  }
--
--
--
--}
--
--void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
--  _documentTextBuffer.clear();
--  _documentPath = filename;
--}
--
--void indri::parse::PDFDocumentExtractor::close() {
--  _documentPath = "";
--}
--
--indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
--  if( !_documentPath.length() )
--    return 0;
--
--  PDFDoc* doc = 0;
--  TextOutputDev* textOut = 0;
--  GString* gfilename = new GString(_documentPath.c_str());
--  doc = new PDFDoc( gfilename );
--  // if the doc is not ok, or ok to copy, it
--  // will be a document of length 0.
--  if( doc->isOk() && doc->okToCopy() ) {
--    void* stream = &_documentTextBuffer;
--    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
--    if ( textOut->isOk() ) {
--      int firstPage = 1;
--      int lastPage = doc->getNumPages();
--	  double hDPI=72.0;
--	  double vDPI=72.0;
--	  int rotate=0;
--	  GBool useMediaBox=gFalse;
--	  GBool crop=gTrue; 
--	  GBool printing=gFalse; 
--	  if(doc->readMetadata()!=NULL)
--	  {
--		  GString rawMetaData = doc->readMetadata();
--		  GString preparedMetaData="";
--
--		  //zoek <rdf:RDF  en eindig bij </rdf:RDF>!! 
--		  for(int x=0; x<rawMetaData.getLength(); x++) {
--			  if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
--				  //skip characters which the XMLReader doesn't understand
--				  preparedMetaData.append(rawMetaData.getChar(x));
--			  }
--		  }
--		  std::string metaData(preparedMetaData.getCString());
--		  int startbegin = metaData.find("<rdf");
--		  int stopend = metaData.find(">", metaData.rfind("</rdf") );
--		  metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
--	  
--
--     	  indri::xml::XMLReader reader;
--
--		  try {
--			  std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
--			  appendPdfMetaData( result.get() );
--		  } catch( lemur::api::Exception& e ) {
--			LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
--		  } 
--		  if( _author.length()>0 || _title.length()>0 )
--		  {
--			std::string createdPdfHeader;
--			createdPdfHeader="<head>\n";
--			if(_title.length()>0) {
--				createdPdfHeader+="<title>";
--				createdPdfHeader+=_title;
--				createdPdfHeader+="</title>\n";
--			}
--			if(_author.length()>0) {
--				createdPdfHeader+="<author>";
--				createdPdfHeader+=_author;
--				createdPdfHeader+="</author>\n";
--			}
--			createdPdfHeader+="</head>\n";
--			char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
--			strcpy(metastream, createdPdfHeader.c_str());
--		  }
--	  }
--      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
--    }
--  }
--  
--
--  delete textOut;
--  delete doc;
--
--  _unparsedDocument.textLength = _documentTextBuffer.position();
--  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
--  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
--  strcpy( docnoPoint, _documentPath.c_str() );
--  _unparsedDocument.text = _documentTextBuffer.front();
--  _unparsedDocument.content = _documentTextBuffer.front();
--  _unparsedDocument.metadata.clear();
--
--  indri::parse::MetadataPair pair;
--
--  pair.key = "path";
--  pair.value = docnoPoint;
--  pair.valueLength = _documentPath.length()+1;
--  _unparsedDocument.metadata.push_back( pair );
--
--  _docnostring.assign(_documentPath.c_str() );
--  cleanDocno();
--  pair.value = _docnostring.c_str();
--  pair.valueLength = _docnostring.length()+1;
--  pair.key = "docno";
--  _unparsedDocument.metadata.push_back( pair );
-- 
--  _documentPath = "";
--
--  return &_unparsedDocument;
--}
---- indri-5.4/include/indri/PDFDocumentExtractor.hpp	čt črc  4 15:16:04 2013
-+++ indri-5.4/include/indri/PDFDocumentExtractor.hpp	čt črc  4 15:15:00 2013
-@@ -1,57 +1,0 @@
--/*==========================================================================
-- * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
-- *
-- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-- * is subject to the terms of the software license set forth in the LICENSE
-- * file included with this software, and also available at
-- * http://www.lemurproject.org/license.html
-- *
-- *==========================================================================
-- */
--
--
--//
--// PDFDocumentExtractor
--//
--// 25 June 2004 -- tds
--//
--
--#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
--#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
--
--#include "lemur/lemur-compat.hpp"
--#include "indri/Buffer.hpp"
--#include "indri/UnparsedDocument.hpp"
--#include "indri/DocumentIterator.hpp"
--#include "indri/XMLReader.hpp"
--#include "indri/XMLNode.hpp"
--#include "indri/XMLWriter.hpp"
--#include <string>
--namespace indri
--{
--  namespace parse
--  {
--    
--    class PDFDocumentExtractor : public DocumentIterator {
--      indri::utility::Buffer _documentTextBuffer;
--      UnparsedDocument _unparsedDocument;
--      std::string _documentPath;
--  
--    public:
--      PDFDocumentExtractor();
--      ~PDFDocumentExtractor();
--
--      void open( const std::string& filename );
--      UnparsedDocument* nextDocument();
--	  void appendPdfMetaData(indri::xml::XMLNode* node);
--	  void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
--      void close();
--	private:
--	  std::string _title;
--	  std::string _author;
--
--    };
--  }
--}
--
--#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
---- indri-5.4/src/DocumentIteratorFactory.cpp	čt črc  4 15:24:24 2013
-+++ indri-5.4/src/DocumentIteratorFactory.cpp	čt črc  4 15:23:27 2013
-@@ -18,7 +18,6 @@
- 
- #include "indri/DocumentIteratorFactory.hpp"
- 
--#include "indri/PDFDocumentExtractor.hpp"
- #include "indri/TaggedDocumentIterator.hpp"
- #include "indri/WARCDocumentIterator.hpp"
- #include "indri/TextDocumentExtractor.hpp"
-@@ -36,7 +35,6 @@
- 
- #define TYPE_TAGGED   ( "Tagged Document Collection" )
- #define TYPE_WARC     ( "WARC Document Collection" )
--#define TYPE_PDF      ( "Adobe PDF" )
- #define TYPE_WORD     ( "Microsoft Word" )
- #define TYPE_PPT      ( "Microsoft PowerPoint" )
- #define TYPE_MBOX     ( "Mailbox" )
-@@ -53,8 +51,6 @@
-     result = iter;
-   } else if( preferred == TYPE_WARC ) {
-     result = new indri::parse::WARCDocumentIterator();
--  } else if( preferred == TYPE_PDF ) {
--    result = new indri::parse::PDFDocumentExtractor();
-   } else if( preferred == TYPE_TEXT ) {
-     result = new indri::parse::TextDocumentExtractor();
-   } else if( preferred == TYPE_MBOX ) {
-@@ -83,8 +79,6 @@
-     return TYPE_TAGGED;
-   } else if( type == "warc" || type == TYPE_WARC ) {
-     return TYPE_WARC;
--  } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
--    return TYPE_PDF;
-   } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
-     return TYPE_WORD;
-   } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
---- indri-5.4/src/FileClassEnvironmentFactory.cpp	čt črc  4 15:33:56 2013
-+++ indri-5.4/src/FileClassEnvironmentFactory.cpp	čt črc  4 15:33:20 2013
-@@ -55,8 +55,6 @@
- // case.  Values specified here can be in mixed case, since values are
- // matched in a case-sensitive manner.
- 
--static const char* pdf_index_tags[] = { "title", "author", 0 };
--static const char* pdf_metadata_tags[] = { "title", "author", 0 };
- static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
- static const char* html_metadata_tags[] = { "title", "author", 0 };
- //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
-@@ -279,21 +277,6 @@
- #endif
- 
-   {
--    "pdf",                // name
--    "html",               // parser
--    "word",               // tokenizer
--    "pdf",                // iterator
--    NULL,                 // startDocTag
--    NULL,                 // endDocTag
--    NULL,                 // endMetadataTag
--    NULL,                 // includeTags
--    NULL,                 // excludeTags
--    pdf_index_tags,       // indexTags
--    pdf_metadata_tags,    // metadataTags
--    NULL                  // conflations
--  },
--
--  {
-     "txt",                // name
-     "text",               // parser
-     "word",               // tokenizer