author | Shawn Walker-Salas <shawn.walker@oracle.com> |
Mon, 23 Nov 2015 15:33:43 -0800 | |
changeset 5125 | 34cc580c62c2 |
parent 1626 | 8dee2dfe2525 |
permissions | -rw-r--r-- |
1626
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
1 |
Remove xpdf support from the build. |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
2 |
--- indri-5.4/MakeDefns.in čt črc 4 15:01:17 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
3 |
+++ indri-5.4/MakeDefns.in čt črc 4 15:00:40 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
4 |
@@ -48,7 +48,7 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
5 |
PHPINCLUDE = @PHPINCLUDE@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
6 |
MCS=@MCS@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
7 |
|
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
8 |
-DEPENDENCIES = lemur xpdf |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
9 |
+DEPENDENCIES = lemur |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
10 |
ifeq ($(NEED_ANTLR), 1) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
11 |
DEPENDENCIES += antlr |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
12 |
endif |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
13 |
--- indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:46 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
14 |
+++ indri-5.4/src/PDFDocumentExtractor.cpp čt črc 4 15:08:28 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
15 |
@@ -1,214 +1,0 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
16 |
-/*========================================================================== |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
17 |
- * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
18 |
- * |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
19 |
- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
20 |
- * is subject to the terms of the software license set forth in the LICENSE |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
21 |
- * file included with this software, and also available at |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
22 |
- * http://www.lemurproject.org/license.html |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
23 |
- * |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
24 |
- *========================================================================== |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
25 |
-*/ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
26 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
27 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
28 |
-// |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
29 |
-// PDFDocumentExtractor |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
30 |
-// |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
31 |
-// 25 June 2004 -- tds |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
32 |
-// |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
33 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
34 |
-#include "indri/PDFDocumentExtractor.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
35 |
-#include "indri/Buffer.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
36 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
37 |
-#include "GString.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
38 |
-#include "TextOutputDev.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
39 |
-#include "PDFDoc.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
40 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
41 |
-#include "Object.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
42 |
-#include "Stream.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
43 |
-#include "Array.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
44 |
-#include "Dict.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
45 |
-#include "XRef.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
46 |
-#include "Page.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
47 |
-#include "CharTypes.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
48 |
-#include "GlobalParams.h" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
49 |
-#include "lemur/Exception.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
50 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
51 |
-static void buffer_write( void* stream, char* text, int len ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
52 |
- indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
53 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
54 |
- if( buffer->position() ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
55 |
- buffer->unwrite(1); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
56 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
57 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
58 |
- memcpy( buffer->write(len), text, len ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
59 |
- if( text[len-1] != 0 ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
60 |
- *buffer->write(1) = 0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
61 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
62 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
63 |
-indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
64 |
- globalParams = new GlobalParams(0); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
65 |
- _title=""; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
66 |
- _author=""; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
67 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
68 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
69 |
-indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
70 |
- delete globalParams; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
71 |
- globalParams = 0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
72 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
73 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
74 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
75 |
-void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
76 |
- if (node == NULL) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
77 |
- return; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
78 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
79 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
80 |
- const std::vector<indri::xml::XMLNode*>& children = node->getChildren(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
81 |
- for( size_t i=0; i<children.size(); i++ ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
82 |
- indri::xml::XMLNode* child = children[i]; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
83 |
- metaTag = child->getValue(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
84 |
- if(metaTag.length()==0) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
85 |
- seekValue(child,metaTag); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
86 |
- else |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
87 |
- return; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
88 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
89 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
90 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
91 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
92 |
-void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
93 |
- indri::xml::XMLNode* current = 0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
94 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
95 |
- if (node == NULL) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
96 |
- return; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
97 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
98 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
99 |
- const std::vector<indri::xml::XMLNode*>& children = node->getChildren(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
100 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
101 |
- for( size_t i=0; i<children.size(); i++ ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
102 |
- indri::xml::XMLNode* child = children[i]; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
103 |
- std::string name = child->getName(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
104 |
- if(name=="dccreator") |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
105 |
- { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
106 |
- seekValue(child,_author); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
107 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
108 |
- if(name=="dctitle") |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
109 |
- { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
110 |
- seekValue(child,_title); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
111 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
112 |
- appendPdfMetaData(child); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
113 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
114 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
115 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
116 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
117 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
118 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
119 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
120 |
-void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
121 |
- _documentTextBuffer.clear(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
122 |
- _documentPath = filename; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
123 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
124 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
125 |
-void indri::parse::PDFDocumentExtractor::close() { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
126 |
- _documentPath = ""; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
127 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
128 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
129 |
-indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
130 |
- if( !_documentPath.length() ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
131 |
- return 0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
132 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
133 |
- PDFDoc* doc = 0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
134 |
- TextOutputDev* textOut = 0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
135 |
- GString* gfilename = new GString(_documentPath.c_str()); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
136 |
- doc = new PDFDoc( gfilename ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
137 |
- // if the doc is not ok, or ok to copy, it |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
138 |
- // will be a document of length 0. |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
139 |
- if( doc->isOk() && doc->okToCopy() ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
140 |
- void* stream = &_documentTextBuffer; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
141 |
- textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
142 |
- if ( textOut->isOk() ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
143 |
- int firstPage = 1; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
144 |
- int lastPage = doc->getNumPages(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
145 |
- double hDPI=72.0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
146 |
- double vDPI=72.0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
147 |
- int rotate=0; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
148 |
- GBool useMediaBox=gFalse; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
149 |
- GBool crop=gTrue; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
150 |
- GBool printing=gFalse; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
151 |
- if(doc->readMetadata()!=NULL) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
152 |
- { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
153 |
- GString rawMetaData = doc->readMetadata(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
154 |
- GString preparedMetaData=""; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
155 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
156 |
- //zoek <rdf:RDF en eindig bij </rdf:RDF>!! |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
157 |
- for(int x=0; x<rawMetaData.getLength(); x++) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
158 |
- if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
159 |
- //skip characters which the XMLReader doesn't understand |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
160 |
- preparedMetaData.append(rawMetaData.getChar(x)); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
161 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
162 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
163 |
- std::string metaData(preparedMetaData.getCString()); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
164 |
- int startbegin = metaData.find("<rdf"); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
165 |
- int stopend = metaData.find(">", metaData.rfind("</rdf") ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
166 |
- metaData = metaData.substr(startbegin, (stopend-startbegin)+1 ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
167 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
168 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
169 |
- indri::xml::XMLReader reader; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
170 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
171 |
- try { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
172 |
- std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
173 |
- appendPdfMetaData( result.get() ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
174 |
- } catch( lemur::api::Exception& e ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
175 |
- LEMUR_RETHROW( e, "Had trouble reading PDF metadata" ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
176 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
177 |
- if( _author.length()>0 || _title.length()>0 ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
178 |
- { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
179 |
- std::string createdPdfHeader; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
180 |
- createdPdfHeader="<head>\n"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
181 |
- if(_title.length()>0) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
182 |
- createdPdfHeader+="<title>"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
183 |
- createdPdfHeader+=_title; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
184 |
- createdPdfHeader+="</title>\n"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
185 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
186 |
- if(_author.length()>0) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
187 |
- createdPdfHeader+="<author>"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
188 |
- createdPdfHeader+=_author; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
189 |
- createdPdfHeader+="</author>\n"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
190 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
191 |
- createdPdfHeader+="</head>\n"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
192 |
- char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
193 |
- strcpy(metastream, createdPdfHeader.c_str()); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
194 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
195 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
196 |
- doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
197 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
198 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
199 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
200 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
201 |
- delete textOut; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
202 |
- delete doc; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
203 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
204 |
- _unparsedDocument.textLength = _documentTextBuffer.position(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
205 |
- _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty. |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
206 |
- char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
207 |
- strcpy( docnoPoint, _documentPath.c_str() ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
208 |
- _unparsedDocument.text = _documentTextBuffer.front(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
209 |
- _unparsedDocument.content = _documentTextBuffer.front(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
210 |
- _unparsedDocument.metadata.clear(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
211 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
212 |
- indri::parse::MetadataPair pair; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
213 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
214 |
- pair.key = "path"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
215 |
- pair.value = docnoPoint; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
216 |
- pair.valueLength = _documentPath.length()+1; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
217 |
- _unparsedDocument.metadata.push_back( pair ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
218 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
219 |
- _docnostring.assign(_documentPath.c_str() ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
220 |
- cleanDocno(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
221 |
- pair.value = _docnostring.c_str(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
222 |
- pair.valueLength = _docnostring.length()+1; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
223 |
- pair.key = "docno"; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
224 |
- _unparsedDocument.metadata.push_back( pair ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
225 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
226 |
- _documentPath = ""; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
227 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
228 |
- return &_unparsedDocument; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
229 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
230 |
--- indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:16:04 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
231 |
+++ indri-5.4/include/indri/PDFDocumentExtractor.hpp čt črc 4 15:15:00 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
232 |
@@ -1,57 +1,0 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
233 |
-/*========================================================================== |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
234 |
- * Copyright (c) 2003-2004 University of Massachusetts. All Rights Reserved. |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
235 |
- * |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
236 |
- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
237 |
- * is subject to the terms of the software license set forth in the LICENSE |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
238 |
- * file included with this software, and also available at |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
239 |
- * http://www.lemurproject.org/license.html |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
240 |
- * |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
241 |
- *========================================================================== |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
242 |
- */ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
243 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
244 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
245 |
-// |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
246 |
-// PDFDocumentExtractor |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
247 |
-// |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
248 |
-// 25 June 2004 -- tds |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
249 |
-// |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
250 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
251 |
-#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
252 |
-#define INDRI_PDFDOCUMENTEXTRACTOR_HPP |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
253 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
254 |
-#include "lemur/lemur-compat.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
255 |
-#include "indri/Buffer.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
256 |
-#include "indri/UnparsedDocument.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
257 |
-#include "indri/DocumentIterator.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
258 |
-#include "indri/XMLReader.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
259 |
-#include "indri/XMLNode.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
260 |
-#include "indri/XMLWriter.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
261 |
-#include <string> |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
262 |
-namespace indri |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
263 |
-{ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
264 |
- namespace parse |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
265 |
- { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
266 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
267 |
- class PDFDocumentExtractor : public DocumentIterator { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
268 |
- indri::utility::Buffer _documentTextBuffer; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
269 |
- UnparsedDocument _unparsedDocument; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
270 |
- std::string _documentPath; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
271 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
272 |
- public: |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
273 |
- PDFDocumentExtractor(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
274 |
- ~PDFDocumentExtractor(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
275 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
276 |
- void open( const std::string& filename ); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
277 |
- UnparsedDocument* nextDocument(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
278 |
- void appendPdfMetaData(indri::xml::XMLNode* node); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
279 |
- void seekValue(indri::xml::XMLNode* node, std::string &metaTag); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
280 |
- void close(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
281 |
- private: |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
282 |
- std::string _title; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
283 |
- std::string _author; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
284 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
285 |
- }; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
286 |
- } |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
287 |
-} |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
288 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
289 |
-#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
290 |
--- indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:24:24 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
291 |
+++ indri-5.4/src/DocumentIteratorFactory.cpp čt črc 4 15:23:27 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
292 |
@@ -18,7 +18,6 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
293 |
|
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
294 |
#include "indri/DocumentIteratorFactory.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
295 |
|
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
296 |
-#include "indri/PDFDocumentExtractor.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
297 |
#include "indri/TaggedDocumentIterator.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
298 |
#include "indri/WARCDocumentIterator.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
299 |
#include "indri/TextDocumentExtractor.hpp" |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
300 |
@@ -36,7 +35,6 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
301 |
|
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
302 |
#define TYPE_TAGGED ( "Tagged Document Collection" ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
303 |
#define TYPE_WARC ( "WARC Document Collection" ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
304 |
-#define TYPE_PDF ( "Adobe PDF" ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
305 |
#define TYPE_WORD ( "Microsoft Word" ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
306 |
#define TYPE_PPT ( "Microsoft PowerPoint" ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
307 |
#define TYPE_MBOX ( "Mailbox" ) |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
308 |
@@ -53,8 +51,6 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
309 |
result = iter; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
310 |
} else if( preferred == TYPE_WARC ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
311 |
result = new indri::parse::WARCDocumentIterator(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
312 |
- } else if( preferred == TYPE_PDF ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
313 |
- result = new indri::parse::PDFDocumentExtractor(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
314 |
} else if( preferred == TYPE_TEXT ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
315 |
result = new indri::parse::TextDocumentExtractor(); |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
316 |
} else if( preferred == TYPE_MBOX ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
317 |
@@ -83,8 +79,6 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
318 |
return TYPE_TAGGED; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
319 |
} else if( type == "warc" || type == TYPE_WARC ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
320 |
return TYPE_WARC; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
321 |
- } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
322 |
- return TYPE_PDF; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
323 |
} else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
324 |
return TYPE_WORD; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
325 |
} else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
326 |
--- indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:56 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
327 |
+++ indri-5.4/src/FileClassEnvironmentFactory.cpp čt črc 4 15:33:20 2013 |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
328 |
@@ -55,8 +55,6 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
329 |
// case. Values specified here can be in mixed case, since values are |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
330 |
// matched in a case-sensitive manner. |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
331 |
|
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
332 |
-static const char* pdf_index_tags[] = { "title", "author", 0 }; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
333 |
-static const char* pdf_metadata_tags[] = { "title", "author", 0 }; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
334 |
static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 }; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
335 |
static const char* html_metadata_tags[] = { "title", "author", 0 }; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
336 |
//static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 }; |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
337 |
@@ -279,21 +277,6 @@ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
338 |
#endif |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
339 |
|
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
340 |
{ |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
341 |
- "pdf", // name |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
342 |
- "html", // parser |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
343 |
- "word", // tokenizer |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
344 |
- "pdf", // iterator |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
345 |
- NULL, // startDocTag |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
346 |
- NULL, // endDocTag |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
347 |
- NULL, // endMetadataTag |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
348 |
- NULL, // includeTags |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
349 |
- NULL, // excludeTags |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
350 |
- pdf_index_tags, // indexTags |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
351 |
- pdf_metadata_tags, // metadataTags |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
352 |
- NULL // conflations |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
353 |
- }, |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
354 |
- |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
355 |
- { |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
356 |
"txt", // name |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
357 |
"text", // parser |
8dee2dfe2525
PSARC/2013/232 Indri
Vladimir Marek <Vladimir.Marek@oracle.com>
parents:
diff
changeset
|
358 |
"word", // tokenizer |