components/indri/patches/remove_xpdf.patch
author Shawn Walker-Salas <shawn.walker@oracle.com>
Mon, 23 Nov 2015 15:33:43 -0800
changeset 5125 34cc580c62c2
parent 1626 8dee2dfe2525
permissions -rw-r--r--
21029732 PKG_CONFIG_PATH should be included in CONFIGURE_ENV and BUILD_ENV 21029735 shared-macros should define service manifest and method macros 21093823 sample-manifest should omit or comment "standard" directories and certain files 21144358 configure scripts usually detect wrong host for 64-bit builds 21157847 shared-macros.mk instructions have a typo for studio c99 mode 22067225 common make-rules desired for simplifying makefiles 22067806 transforms needs expanding for desktop services
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
1626
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     1
Remove xpdf support from the build.
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     2
--- indri-5.4/MakeDefns.in	čt črc  4 15:01:17 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     3
+++ indri-5.4/MakeDefns.in	čt črc  4 15:00:40 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     4
@@ -48,7 +48,7 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     5
 PHPINCLUDE = @PHPINCLUDE@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     6
 MCS=@MCS@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     7
 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     8
-DEPENDENCIES = lemur xpdf
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
     9
+DEPENDENCIES = lemur
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    10
 ifeq ($(NEED_ANTLR), 1)
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    11
   DEPENDENCIES += antlr
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    12
 endif
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    13
--- indri-5.4/src/PDFDocumentExtractor.cpp	čt črc  4 15:08:46 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    14
+++ indri-5.4/src/PDFDocumentExtractor.cpp	čt črc  4 15:08:28 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    15
@@ -1,214 +1,0 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    16
-/*==========================================================================
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    17
- * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    18
- *
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    19
- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    20
- * is subject to the terms of the software license set forth in the LICENSE
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    21
- * file included with this software, and also available at
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    22
- * http://www.lemurproject.org/license.html
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    23
- *
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    24
- *==========================================================================
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    25
-*/
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    26
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    27
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    28
-//
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    29
-// PDFDocumentExtractor
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    30
-//
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    31
-// 25 June 2004 -- tds
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    32
-//
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    33
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    34
-#include "indri/PDFDocumentExtractor.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    35
-#include "indri/Buffer.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    36
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    37
-#include "GString.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    38
-#include "TextOutputDev.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    39
-#include "PDFDoc.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    40
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    41
-#include "Object.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    42
-#include "Stream.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    43
-#include "Array.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    44
-#include "Dict.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    45
-#include "XRef.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    46
-#include "Page.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    47
-#include "CharTypes.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    48
-#include "GlobalParams.h"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    49
-#include "lemur/Exception.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    50
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    51
-static void buffer_write( void* stream, char* text, int len ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    52
-  indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    53
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    54
-  if( buffer->position() ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    55
-    buffer->unwrite(1);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    56
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    57
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    58
-  memcpy( buffer->write(len), text, len );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    59
-  if( text[len-1] != 0 )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    60
-    *buffer->write(1) = 0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    61
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    62
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    63
-indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    64
-  globalParams = new GlobalParams(0);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    65
-  _title="";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    66
-  _author="";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    67
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    68
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    69
-indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    70
-  delete globalParams;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    71
-  globalParams = 0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    72
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    73
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    74
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    75
-void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    76
-  if (node == NULL) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    77
-    return;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    78
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    79
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    80
-  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    81
-  for( size_t i=0; i<children.size(); i++ ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    82
-    indri::xml::XMLNode* child = children[i];
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    83
-    metaTag = child->getValue();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    84
-	if(metaTag.length()==0)
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    85
-		seekValue(child,metaTag);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    86
-	else
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    87
-		return;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    88
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    89
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    90
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    91
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    92
-void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    93
-  indri::xml::XMLNode* current = 0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    94
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    95
-  if (node == NULL) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    96
-    return;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    97
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    98
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
    99
-  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   100
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   101
-  for( size_t i=0; i<children.size(); i++ ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   102
-    indri::xml::XMLNode* child = children[i];
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   103
-    std::string name = child->getName();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   104
-	if(name=="dccreator")
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   105
-	{
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   106
-		seekValue(child,_author);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   107
-	}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   108
-	if(name=="dctitle")
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   109
-	{
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   110
-		seekValue(child,_title);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   111
-	}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   112
-	appendPdfMetaData(child);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   113
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   114
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   115
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   116
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   117
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   118
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   119
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   120
-void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   121
-  _documentTextBuffer.clear();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   122
-  _documentPath = filename;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   123
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   124
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   125
-void indri::parse::PDFDocumentExtractor::close() {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   126
-  _documentPath = "";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   127
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   128
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   129
-indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   130
-  if( !_documentPath.length() )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   131
-    return 0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   132
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   133
-  PDFDoc* doc = 0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   134
-  TextOutputDev* textOut = 0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   135
-  GString* gfilename = new GString(_documentPath.c_str());
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   136
-  doc = new PDFDoc( gfilename );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   137
-  // if the doc is not ok, or ok to copy, it
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   138
-  // will be a document of length 0.
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   139
-  if( doc->isOk() && doc->okToCopy() ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   140
-    void* stream = &_documentTextBuffer;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   141
-    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   142
-    if ( textOut->isOk() ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   143
-      int firstPage = 1;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   144
-      int lastPage = doc->getNumPages();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   145
-	  double hDPI=72.0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   146
-	  double vDPI=72.0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   147
-	  int rotate=0;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   148
-	  GBool useMediaBox=gFalse;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   149
-	  GBool crop=gTrue; 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   150
-	  GBool printing=gFalse; 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   151
-	  if(doc->readMetadata()!=NULL)
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   152
-	  {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   153
-		  GString rawMetaData = doc->readMetadata();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   154
-		  GString preparedMetaData="";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   155
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   156
-		  //zoek <rdf:RDF  en eindig bij </rdf:RDF>!! 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   157
-		  for(int x=0; x<rawMetaData.getLength(); x++) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   158
-			  if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   159
-				  //skip characters which the XMLReader doesn't understand
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   160
-				  preparedMetaData.append(rawMetaData.getChar(x));
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   161
-			  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   162
-		  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   163
-		  std::string metaData(preparedMetaData.getCString());
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   164
-		  int startbegin = metaData.find("<rdf");
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   165
-		  int stopend = metaData.find(">", metaData.rfind("</rdf") );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   166
-		  metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   167
-	  
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   168
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   169
-     	  indri::xml::XMLReader reader;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   170
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   171
-		  try {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   172
-			  std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   173
-			  appendPdfMetaData( result.get() );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   174
-		  } catch( lemur::api::Exception& e ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   175
-			LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   176
-		  } 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   177
-		  if( _author.length()>0 || _title.length()>0 )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   178
-		  {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   179
-			std::string createdPdfHeader;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   180
-			createdPdfHeader="<head>\n";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   181
-			if(_title.length()>0) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   182
-				createdPdfHeader+="<title>";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   183
-				createdPdfHeader+=_title;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   184
-				createdPdfHeader+="</title>\n";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   185
-			}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   186
-			if(_author.length()>0) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   187
-				createdPdfHeader+="<author>";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   188
-				createdPdfHeader+=_author;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   189
-				createdPdfHeader+="</author>\n";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   190
-			}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   191
-			createdPdfHeader+="</head>\n";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   192
-			char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   193
-			strcpy(metastream, createdPdfHeader.c_str());
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   194
-		  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   195
-	  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   196
-      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   197
-    }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   198
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   199
-  
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   200
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   201
-  delete textOut;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   202
-  delete doc;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   203
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   204
-  _unparsedDocument.textLength = _documentTextBuffer.position();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   205
-  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   206
-  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   207
-  strcpy( docnoPoint, _documentPath.c_str() );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   208
-  _unparsedDocument.text = _documentTextBuffer.front();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   209
-  _unparsedDocument.content = _documentTextBuffer.front();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   210
-  _unparsedDocument.metadata.clear();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   211
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   212
-  indri::parse::MetadataPair pair;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   213
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   214
-  pair.key = "path";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   215
-  pair.value = docnoPoint;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   216
-  pair.valueLength = _documentPath.length()+1;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   217
-  _unparsedDocument.metadata.push_back( pair );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   218
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   219
-  _docnostring.assign(_documentPath.c_str() );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   220
-  cleanDocno();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   221
-  pair.value = _docnostring.c_str();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   222
-  pair.valueLength = _docnostring.length()+1;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   223
-  pair.key = "docno";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   224
-  _unparsedDocument.metadata.push_back( pair );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   225
- 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   226
-  _documentPath = "";
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   227
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   228
-  return &_unparsedDocument;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   229
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   230
--- indri-5.4/include/indri/PDFDocumentExtractor.hpp	čt črc  4 15:16:04 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   231
+++ indri-5.4/include/indri/PDFDocumentExtractor.hpp	čt črc  4 15:15:00 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   232
@@ -1,57 +1,0 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   233
-/*==========================================================================
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   234
- * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   235
- *
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   236
- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   237
- * is subject to the terms of the software license set forth in the LICENSE
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   238
- * file included with this software, and also available at
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   239
- * http://www.lemurproject.org/license.html
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   240
- *
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   241
- *==========================================================================
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   242
- */
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   243
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   244
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   245
-//
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   246
-// PDFDocumentExtractor
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   247
-//
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   248
-// 25 June 2004 -- tds
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   249
-//
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   250
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   251
-#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   252
-#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   253
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   254
-#include "lemur/lemur-compat.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   255
-#include "indri/Buffer.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   256
-#include "indri/UnparsedDocument.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   257
-#include "indri/DocumentIterator.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   258
-#include "indri/XMLReader.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   259
-#include "indri/XMLNode.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   260
-#include "indri/XMLWriter.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   261
-#include <string>
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   262
-namespace indri
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   263
-{
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   264
-  namespace parse
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   265
-  {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   266
-    
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   267
-    class PDFDocumentExtractor : public DocumentIterator {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   268
-      indri::utility::Buffer _documentTextBuffer;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   269
-      UnparsedDocument _unparsedDocument;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   270
-      std::string _documentPath;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   271
-  
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   272
-    public:
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   273
-      PDFDocumentExtractor();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   274
-      ~PDFDocumentExtractor();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   275
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   276
-      void open( const std::string& filename );
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   277
-      UnparsedDocument* nextDocument();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   278
-	  void appendPdfMetaData(indri::xml::XMLNode* node);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   279
-	  void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   280
-      void close();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   281
-	private:
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   282
-	  std::string _title;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   283
-	  std::string _author;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   284
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   285
-    };
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   286
-  }
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   287
-}
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   288
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   289
-#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   290
--- indri-5.4/src/DocumentIteratorFactory.cpp	čt črc  4 15:24:24 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   291
+++ indri-5.4/src/DocumentIteratorFactory.cpp	čt črc  4 15:23:27 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   292
@@ -18,7 +18,6 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   293
 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   294
 #include "indri/DocumentIteratorFactory.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   295
 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   296
-#include "indri/PDFDocumentExtractor.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   297
 #include "indri/TaggedDocumentIterator.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   298
 #include "indri/WARCDocumentIterator.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   299
 #include "indri/TextDocumentExtractor.hpp"
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   300
@@ -36,7 +35,6 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   301
 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   302
 #define TYPE_TAGGED   ( "Tagged Document Collection" )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   303
 #define TYPE_WARC     ( "WARC Document Collection" )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   304
-#define TYPE_PDF      ( "Adobe PDF" )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   305
 #define TYPE_WORD     ( "Microsoft Word" )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   306
 #define TYPE_PPT      ( "Microsoft PowerPoint" )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   307
 #define TYPE_MBOX     ( "Mailbox" )
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   308
@@ -53,8 +51,6 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   309
     result = iter;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   310
   } else if( preferred == TYPE_WARC ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   311
     result = new indri::parse::WARCDocumentIterator();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   312
-  } else if( preferred == TYPE_PDF ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   313
-    result = new indri::parse::PDFDocumentExtractor();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   314
   } else if( preferred == TYPE_TEXT ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   315
     result = new indri::parse::TextDocumentExtractor();
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   316
   } else if( preferred == TYPE_MBOX ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   317
@@ -83,8 +79,6 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   318
     return TYPE_TAGGED;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   319
   } else if( type == "warc" || type == TYPE_WARC ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   320
     return TYPE_WARC;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   321
-  } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   322
-    return TYPE_PDF;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   323
   } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   324
     return TYPE_WORD;
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   325
   } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   326
--- indri-5.4/src/FileClassEnvironmentFactory.cpp	čt črc  4 15:33:56 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   327
+++ indri-5.4/src/FileClassEnvironmentFactory.cpp	čt črc  4 15:33:20 2013
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   328
@@ -55,8 +55,6 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   329
 // case.  Values specified here can be in mixed case, since values are
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   330
 // matched in a case-sensitive manner.
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   331
 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   332
-static const char* pdf_index_tags[] = { "title", "author", 0 };
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   333
-static const char* pdf_metadata_tags[] = { "title", "author", 0 };
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   334
 static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   335
 static const char* html_metadata_tags[] = { "title", "author", 0 };
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   336
 //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   337
@@ -279,21 +277,6 @@
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   338
 #endif
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   339
 
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   340
   {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   341
-    "pdf",                // name
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   342
-    "html",               // parser
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   343
-    "word",               // tokenizer
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   344
-    "pdf",                // iterator
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   345
-    NULL,                 // startDocTag
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   346
-    NULL,                 // endDocTag
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   347
-    NULL,                 // endMetadataTag
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   348
-    NULL,                 // includeTags
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   349
-    NULL,                 // excludeTags
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   350
-    pdf_index_tags,       // indexTags
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   351
-    pdf_metadata_tags,    // metadataTags
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   352
-    NULL                  // conflations
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   353
-  },
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   354
-
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   355
-  {
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   356
     "txt",                // name
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   357
     "text",               // parser
8dee2dfe2525 PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff changeset
   358
     "word",               // tokenizer