|
1 Add our PIA wrapper to indri sources. This patch does several things: |
|
2 - Add pia wrapper sources to indri source tree |
|
3 - Add new tokenizer which does not treat '_' as a separator |
|
4 - The TextTokenizerPIA.l differs from TextTokenizer.l only in a single character |
|
5 -[a-zA-Z0-9']+ { byte_position += tokleng; return ASCII_TOKEN; } |
|
6 +[a-zA-Z0-9_']+ { byte_position += tokleng; return ASCII_TOKEN; } |
|
7 - plus many symbol renames so that the parsers can coexist (toktext -> piatoktext etc.) |
|
8 - TextTokenizerPIA.hpp contains only symbol renames |
|
9 - Rest are modifications to make indri build PIA wrapper |
|
10 |
|
11 |
|
12 --- indri-5.4/pia_wrapper.cpp po črc 15 14:30:41 2013 |
|
13 +++ indri-5.4/pia_wrapper.cpp po črc 15 14:29:09 2013 |
|
14 @@ -0,0 +1,222 @@ |
|
15 +/* |
|
16 + * TO compile : |
|
17 + * g++ -o libpia_wrapper.so -shared -fPIC -I../vlad-libs/sparc/usr/include/ -L../vlad-libs/sparc/usr/lib/ -lclucene-core -lnvpair pia_wrapper.cpp |
|
18 + * |
|
19 + */ |
|
20 + |
|
21 +#include <sys/stat.h> |
|
22 +#include <strings.h> |
|
23 +#include <stdio.h> |
|
24 +#include <libnvpair.h> |
|
25 + |
|
26 +#include <iostream> |
|
27 +#include <string> |
|
28 +#include <sstream> |
|
29 +#include <fstream> |
|
30 + |
|
31 +#include <vector> |
|
32 +#include "indri/QueryEnvironment.hpp" |
|
33 +#include "indri/SnippetBuilder.hpp" |
|
34 +#include "indri/Repository.hpp" |
|
35 + |
|
36 +using namespace std; |
|
37 + |
|
38 +using namespace indri::api; |
|
39 + |
|
40 +#define MAX_RESULTS 3 |
|
41 +#define PIA_DATABASE "/var/db/piadb" |
|
42 +#define PIA_DATABASE_STORAGE PIA_DATABASE "/collection/storage" |
|
43 + |
|
44 +indri::collection::Repository repository; |
|
45 + |
|
46 +std::string |
|
47 +getFieldText(int documentID, std::string field) { |
|
48 + std::string ret_val = ""; |
|
49 + indri::collection::Repository::index_state repIndexState = repository.indexes(); |
|
50 + indri::index::Index *thisIndex=(*repIndexState)[0]; |
|
51 + int fieldID=thisIndex->field(field); |
|
52 + |
|
53 + if (fieldID < 1) { |
|
54 + return ""; |
|
55 + } |
|
56 + |
|
57 + const indri::index::TermList *termList=thisIndex->termList(documentID); |
|
58 + |
|
59 + if (!termList) { |
|
60 + return ""; |
|
61 + } |
|
62 + |
|
63 + indri::utility::greedy_vector< indri::index::FieldExtent > fieldVec=termList->fields(); |
|
64 + indri::utility::greedy_vector< indri::index::FieldExtent >::iterator fIter=fieldVec.begin(); |
|
65 + while (fIter!=fieldVec.end()) { |
|
66 + |
|
67 + if ((*fIter).id==fieldID) { |
|
68 + int beginTerm=(*fIter).begin; |
|
69 + int endTerm=(*fIter).end; |
|
70 + |
|
71 + /* |
|
72 + * note that the text is inclusive of the beginning |
|
73 + * but exclusive of the ending |
|
74 + */ |
|
75 + for (int t=beginTerm; t < endTerm; t++) { |
|
76 + int thisTermID=termList->terms()[t]; |
|
77 + ret_val = ret_val + thisIndex->term(thisTermID) + " "; |
|
78 + } |
|
79 + } |
|
80 + |
|
81 + fIter++; |
|
82 + } |
|
83 + |
|
84 + delete termList; |
|
85 + termList=NULL; |
|
86 + return ret_val; |
|
87 +} |
|
88 + |
|
89 +/* |
|
90 + * Returns NULL on failure |
|
91 + * nvlist * |
|
92 + * search( |
|
93 + * nvlist_t *search_params, |
|
94 + * char **errmsg // Similar to pia_index() |
|
95 + * ); |
|
96 + */ |
|
97 +nvlist * |
|
98 +search (nvlist_t *search_params, char **errmsg) { |
|
99 + |
|
100 + char *index_path = PIA_DATABASE; |
|
101 + nvlist_t **nvl_list_result; |
|
102 + nvlist_t *nvl_return; |
|
103 + nvlist_t *nvl_result; |
|
104 + nvlist_t *results = NULL; |
|
105 + |
|
106 + if (nvlist_alloc(&results, NV_UNIQUE_NAME, 0) != 0) { |
|
107 + *errmsg = strdup("nvlist_alloc failed\n"); |
|
108 + return NULL; |
|
109 + } |
|
110 + |
|
111 + try { |
|
112 + std::string query; |
|
113 + char *panicstack; |
|
114 + (void) nvlist_lookup_string(search_params, "stack", &panicstack); |
|
115 + |
|
116 + QueryEnvironment indriEnvironment; |
|
117 + indriEnvironment.addIndex(index_path); |
|
118 + |
|
119 + /* Create Indri query */ |
|
120 + query = "#combine (" + std::string(panicstack) + ")"; |
|
121 + |
|
122 + QueryAnnotation *QAresults=indriEnvironment.runAnnotatedQuery(query.c_str(), MAX_RESULTS); |
|
123 + |
|
124 + std::vector<indri::api::ScoredExtentResult> resultVector=QAresults->getResults(); |
|
125 + |
|
126 + int totalNumResults=resultVector.size(); |
|
127 + |
|
128 + /* Get Parsed document of the results */ |
|
129 + std::vector<ParsedDocument*> parsedDocs=indriEnvironment.documents(resultVector); |
|
130 + |
|
131 + int results_to_return = 0; |
|
132 + for ( size_t i=0; i < totalNumResults && i < MAX_RESULTS; i++ ) { |
|
133 + results_to_return++; |
|
134 + } |
|
135 + |
|
136 + /* Open Repository */ |
|
137 + repository.openRead(index_path); |
|
138 + |
|
139 + nvl_list_result = (nvlist_t **) malloc(results_to_return * sizeof(nvlist_t *)); |
|
140 + |
|
141 + for ( size_t i=0; i < results_to_return; i++ ) { |
|
142 + |
|
143 + std::string ret=""; |
|
144 + |
|
145 + int thisResultDocID=resultVector[i].document; |
|
146 + |
|
147 + if (nvlist_alloc(&nvl_list_result[i], NV_UNIQUE_NAME, 0) != 0) { |
|
148 + *errmsg = strdup("nvlist_alloc failed\n"); |
|
149 + return NULL; |
|
150 + } |
|
151 + |
|
152 + if ((ret = getFieldText(thisResultDocID, "bug")) == "") { |
|
153 + *errmsg = strdup("Lookup of bugid failed\n"); |
|
154 + return NULL; |
|
155 + } else if (nvlist_add_string(nvl_list_result[i], "pia-bugid", ret.c_str())) { |
|
156 + *errmsg = strdup("nvlist_add bugid failed\n"); |
|
157 + return NULL; |
|
158 + } |
|
159 + |
|
160 + if ((ret = getFieldText(thisResultDocID, "stack")) == "") { |
|
161 + *errmsg = strdup("Lookup of stack failed\n"); |
|
162 + return NULL; |
|
163 + } else if (nvlist_add_string(nvl_list_result[i], "pia-stack", ret.c_str())) { |
|
164 + *errmsg = strdup("nvlist_add stack failed\n"); |
|
165 + return NULL; |
|
166 + } |
|
167 + |
|
168 + if ((ret = getFieldText(thisResultDocID, "signature")) == "") { |
|
169 + *errmsg = strdup("Lookup of signature failed\n"); |
|
170 + return NULL; |
|
171 + } else if (nvlist_add_string(nvl_list_result[i], "pia-signature", ret.c_str())) { |
|
172 + *errmsg = strdup("nvlist_add signature failed\n"); |
|
173 + return NULL; |
|
174 + } |
|
175 + |
|
176 + int indri_score = 1000 + (int)resultVector[i].score*1000; |
|
177 + if (nvlist_add_int32(nvl_list_result[i], "pia-score", indri_score)) { |
|
178 + *errmsg = strdup("nvlist_add score failed\n"); |
|
179 + return NULL; |
|
180 + } |
|
181 + } |
|
182 + repository.close(); |
|
183 + |
|
184 + nvlist_add_nvlist_array(results, "results", nvl_list_result, results_to_return); |
|
185 + |
|
186 + for (int i=0; i<results_to_return; i++) { |
|
187 + nvlist_free(nvl_list_result[i]); |
|
188 + } |
|
189 + |
|
190 + return results; |
|
191 + |
|
192 + } catch(...){ |
|
193 + nvl_list_result = (nvlist_t **) malloc(1 * sizeof(nvlist_t **)); |
|
194 + |
|
195 + if (nvlist_alloc(&nvl_result, NV_UNIQUE_NAME, 0) != 0) { |
|
196 + *errmsg = strdup("nvlist_alloc failed\n"); |
|
197 + return NULL; |
|
198 + } |
|
199 + |
|
200 + if (nvlist_add_string(nvl_result, "error", "Indri Error")) { |
|
201 + *errmsg = strdup("nvlist_add error failed\n"); |
|
202 + return NULL; |
|
203 + } |
|
204 + |
|
205 + nvlist_dup(nvl_result, &nvl_list_result[0], 0); |
|
206 + nvlist_free(nvl_result); |
|
207 + nvlist_add_nvlist_array(results, "results", nvl_list_result, 1); |
|
208 + |
|
209 + return results; |
|
210 + } |
|
211 +} |
|
212 + |
|
213 +extern "C" nvlist* |
|
214 +pia_search (nvlist_t *search_params, char **errmsg) { |
|
215 + |
|
216 + return search (search_params, errmsg); |
|
217 + |
|
218 +} |
|
219 + |
|
220 +int |
|
221 +init () { |
|
222 + |
|
223 + struct stat sb; |
|
224 + if (stat(PIA_DATABASE_STORAGE, &sb) != 0) { |
|
225 + return 1; |
|
226 + } |
|
227 + |
|
228 + return 0; |
|
229 +} |
|
230 + |
|
231 +extern "C" int |
|
232 +pia_init () { |
|
233 + |
|
234 + return init (); |
|
235 + |
|
236 +} |
|
237 --- indri-5.4/src/TextTokenizerPIA.l po črc 15 14:38:12 2013 |
|
238 +++ indri-5.4/src/TextTokenizerPIA.l po črc 15 14:36:55 2013 |
|
239 @@ -0,0 +1,588 @@ |
|
240 +%option noyywrap |
|
241 +%option never-interactive |
|
242 +%option prefix="piatok" |
|
243 + |
|
244 +%{ |
|
245 + |
|
246 +/*========================================================================== |
|
247 + * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. |
|
248 + * |
|
249 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval |
|
250 + * is subject to the terms of the software license set forth in the LICENSE |
|
251 + * file included with this software, and also available at |
|
252 + * http://www.lemurproject.org/license.html |
|
253 + * |
|
254 + *========================================================================== |
|
255 + */ |
|
256 + |
|
257 +// |
|
258 +// TextTokenizerPIA |
|
259 +// |
|
260 +// 15 September 2005 -- mwb |
|
261 +// |
|
262 + |
|
263 +#include <string.h> |
|
264 +#include <ctype.h> |
|
265 +#include "indri/TextTokenizerPIA.hpp" |
|
266 +#include "indri/TermExtent.hpp" |
|
267 +#include "indri/TagEvent.hpp" |
|
268 +#include "indri/TokenizedDocument.hpp" |
|
269 +#include "indri/UnparsedDocument.hpp" |
|
270 +#include "indri/UTF8Transcoder.hpp" |
|
271 +#include "indri/AttributeValuePair.hpp" |
|
272 + |
|
273 +static long byte_position; |
|
274 + |
|
275 +#define ZAP 1 |
|
276 +#define TAG 2 |
|
277 +#define ASCII_TOKEN 3 |
|
278 +#define UTF8_TOKEN 4 |
|
279 + |
|
280 +%} |
|
281 +%start COMMENT |
|
282 +%% |
|
283 + |
|
284 +"<!--" { BEGIN(COMMENT); byte_position += piatokleng; return ZAP; } |
|
285 +<COMMENT>[^-]+ { byte_position += piatokleng; return ZAP; } |
|
286 +<COMMENT>"-->" { BEGIN(INITIAL); byte_position += piatokleng; return ZAP; } |
|
287 +<COMMENT>"-" { byte_position += piatokleng; return ZAP; } |
|
288 +"<!"[^\>]*">" { byte_position += piatokleng; return ZAP; } |
|
289 +\<[a-zA-Z/][^\>]*\> { byte_position += piatokleng; return TAG; } |
|
290 +[&]([a-zA-Z]+|[#]([0-9]+|[xX][a-fA-F0-9]+))[;] { byte_position += piatokleng; return ZAP; /* symbols */ } |
|
291 +[A-Z0-9]"."([A-Z0-9]".")* { byte_position += piatokleng; return ASCII_TOKEN; } |
|
292 +[a-zA-Z0-9_']+ { byte_position += piatokleng; return ASCII_TOKEN; } |
|
293 +"-"[0-9]+("."[0-9]+)? { byte_position += piatokleng; return ASCII_TOKEN; } |
|
294 +[a-zA-Z0-9\x80-\xFD]+ { byte_position += piatokleng; return UTF8_TOKEN; } |
|
295 + |
|
296 +[\n] { byte_position += piatokleng; return ZAP; } |
|
297 +. { byte_position += piatokleng; return ZAP; } |
|
298 + |
|
299 +%% |
|
300 + |
|
301 +indri::parse::TokenizedDocument* indri::parse::TextTokenizerPIA::tokenize( indri::parse::UnparsedDocument* document ) { |
|
302 + |
|
303 + _termBuffer.clear(); |
|
304 + if ( _tokenize_entire_words) |
|
305 + _termBuffer.grow( document->textLength * 4); |
|
306 + else |
|
307 + _termBuffer.grow( document->textLength * 8 ); // extra null per char. |
|
308 + |
|
309 + _document.terms.clear(); |
|
310 + _document.tags.clear(); |
|
311 + _document.positions.clear(); |
|
312 + |
|
313 + _document.metadata = document->metadata; |
|
314 + _document.text = document->text; |
|
315 + _document.textLength = document->textLength; |
|
316 + _document.content = document->content; |
|
317 + _document.contentLength = document->contentLength; |
|
318 + |
|
319 + // byte offset |
|
320 + byte_position = document->content - document->text; |
|
321 + |
|
322 + piatok_scan_bytes( document->content, document->contentLength ); |
|
323 + |
|
324 + // Main Tokenizer loop |
|
325 + |
|
326 + int type; |
|
327 + |
|
328 + while ( type = piatoklex() ) { |
|
329 + |
|
330 + switch ( type ) { |
|
331 + |
|
332 + case ASCII_TOKEN: processASCIIToken(); break; |
|
333 + |
|
334 + case UTF8_TOKEN: processUTF8Token(); break; |
|
335 + |
|
336 + case TAG: if ( _tokenize_markup ) processTag(); break; |
|
337 + |
|
338 + default: |
|
339 + case ZAP: |
|
340 + break; |
|
341 + |
|
342 + } |
|
343 + |
|
344 + } |
|
345 + |
|
346 + piatok_delete_buffer( YY_CURRENT_BUFFER ); |
|
347 + |
|
348 + return &_document; |
|
349 +} |
|
350 + |
|
351 +// Member functions for processing tokenization events as dispatched |
|
352 +// from the main tokenizer loop |
|
353 + |
|
354 +void indri::parse::TextTokenizerPIA::processTag() { |
|
355 + |
|
356 + // Here, we parse the tag in a fashion that is relatively robust to |
|
357 + // malformed markup. toktext matches this pattern: <[^>]+> |
|
358 + |
|
359 + if ( piatoktext[1] == '?' || piatoktext[1] == '!' ) { |
|
360 + |
|
361 + // XML declaration like <? ... ?> and <!DOCTYPE ... > |
|
362 + return; // ignore |
|
363 + |
|
364 + } else if ( piatoktext[1] == '/' ) { // close tag, eg. </FOO> |
|
365 + |
|
366 + // Downcase the tag name. |
|
367 + |
|
368 + int len = 0; |
|
369 + |
|
370 + for ( char *c = piatoktext + 2; |
|
371 +#ifndef WIN32 |
|
372 + isalnum( *c ) || *c == '-' || *c == '_' || *c == ':' ; c++ ) { |
|
373 +#else |
|
374 + ((*c >= 0) && isalnum( *c )) || *c == '-' || *c == '_' || *c == ':' ; c++ ) { |
|
375 +#endif |
|
376 + |
|
377 + *c = tolower( *c ); |
|
378 + if ( *c == ':' ) *c = '_'; /* replace colon (from namespaces) */ |
|
379 + len++; |
|
380 + } |
|
381 + |
|
382 + TagEvent te; |
|
383 + |
|
384 + te.open_tag = false; |
|
385 + |
|
386 + // We need to write len characters, plus a NULL |
|
387 + char* write_loc = _termBuffer.write( len + 1 ); |
|
388 + strncpy( write_loc, piatoktext + 2, len ); |
|
389 + write_loc[len] = '\0'; |
|
390 + te.name = write_loc; |
|
391 + |
|
392 + // token position of tag event w/r/t token string |
|
393 + te.pos = _document.terms.size(); |
|
394 + |
|
395 + te.begin = byte_position - piatokleng; |
|
396 + te.end = byte_position; |
|
397 + |
|
398 + _document.tags.push_back( te ); |
|
399 + |
|
400 +#ifndef WIN32 |
|
401 + } else if ( isalpha( piatoktext[1] ) ) { |
|
402 +#else |
|
403 + } else if ( (piatoktext[1] >= 0) && (isalpha( piatoktext[1] ) )) { |
|
404 +#endif |
|
405 + |
|
406 + // Try to extract the tag name: |
|
407 + |
|
408 + char* c = piatoktext + 1; |
|
409 + int i = 0; |
|
410 + int offset = 1; // current offset w/r/t byte_position - piatokleng |
|
411 + // it starts at one because it is incremented when c is, and c starts at one. |
|
412 + char* write_loc; |
|
413 + |
|
414 +#ifndef WIN32 |
|
415 + while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++; |
|
416 +#else |
|
417 + while ( ( (c[i] >= 0) && isalnum( c[i] )) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++; |
|
418 +#endif |
|
419 + if ( c[i] == '>' ) { |
|
420 + |
|
421 + // open tag with no attributes, eg. <title> |
|
422 + |
|
423 + // Ensure tag name is downcased |
|
424 + for ( int j = 0; j < i; j++ ) { |
|
425 + c[j] = tolower( c[j] ); |
|
426 + if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */ |
|
427 + } |
|
428 + |
|
429 + TagEvent te; |
|
430 + |
|
431 + te.open_tag = true; |
|
432 + |
|
433 + // need to write i characters, plus a NULL |
|
434 + char* write_loc = _termBuffer.write( i + 1 ); |
|
435 + strncpy( write_loc, c, i ); |
|
436 + write_loc[i] = '\0'; |
|
437 + te.name = write_loc; |
|
438 + |
|
439 + te.pos = _document.terms.size(); |
|
440 + |
|
441 + te.begin = byte_position - piatokleng; |
|
442 + te.end = byte_position; |
|
443 + |
|
444 + _document.tags.push_back( te ); |
|
445 + |
|
446 +#ifndef WIN32 |
|
447 + } else if ( isspace( c[i] ) ) { |
|
448 +#else |
|
449 + } else if ( (c[i] >= 0) && (isspace( c[i] ) )) { |
|
450 +#endif |
|
451 + |
|
452 + // open tag with attributes, eg. <A HREF="www.foo.com/bar"> |
|
453 + |
|
454 + TagEvent te; |
|
455 + |
|
456 + te.open_tag = true; |
|
457 + |
|
458 + // Ensure tag name is downcased |
|
459 + for ( int j = 0; j < i; j++ ) { |
|
460 + c[j] = tolower( c[j] ); |
|
461 + if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */ |
|
462 + } |
|
463 + |
|
464 + // need to write i characters, plus a NULL |
|
465 + char* write_loc = _termBuffer.write( i + 1 ); |
|
466 + strncpy( write_loc, c, i ); |
|
467 + write_loc[i] = '\0'; |
|
468 + te.name = write_loc; |
|
469 + c += i; |
|
470 + offset += i; |
|
471 + |
|
472 +#ifndef WIN32 |
|
473 + while ( isspace( *c ) ) { c++; offset++; } |
|
474 +#else |
|
475 + while (((*c) >=0) && isspace( *c )) { c++; offset++; } |
|
476 +#endif |
|
477 + |
|
478 + te.pos = _document.terms.size(); |
|
479 + |
|
480 + te.begin = byte_position - piatokleng; |
|
481 + te.end = byte_position; |
|
482 + |
|
483 + // Now search for attributes: |
|
484 + |
|
485 + while ( *c != '>' && *c != '\0' ) { |
|
486 + |
|
487 + AttributeValuePair avp; |
|
488 + |
|
489 + // Try to extract attribute name: |
|
490 + |
|
491 + i = 0; |
|
492 +#ifndef WIN32 |
|
493 + while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' ) i++; |
|
494 +#else |
|
495 + while ( (c[i] >= 0) && isalnum( c[i] ) || c[i] == '-' || c[i] == '_') i++; |
|
496 +#endif |
|
497 + |
|
498 + if ( i == 0 ) break; |
|
499 + |
|
500 + // Ensure attribute name is downcased |
|
501 + for ( int j = 0; j < i; j++ ) |
|
502 + c[j] = tolower( c[j] ); |
|
503 + |
|
504 + // need to write i characters, plus a NULL |
|
505 + write_loc = _termBuffer.write( i + 1 ); |
|
506 + strncpy( write_loc, c, i ); |
|
507 + write_loc[i] = '\0'; |
|
508 + avp.attribute = write_loc; |
|
509 + c += i; |
|
510 + offset += i; |
|
511 + |
|
512 + // attributes can be foo\s*=\s*"bar[">] or foo\s*=\s*bar |
|
513 + |
|
514 + // ignore any spaces |
|
515 +#ifndef WIN32 |
|
516 + while ( isspace( *c ) ) { c++; offset++; } |
|
517 +#else |
|
518 + while (((*c) >=0) && isspace( *c )) { c++; offset++; } |
|
519 +#endif |
|
520 + |
|
521 + if ( *c == '=' ) { |
|
522 + |
|
523 + c++; // get past the '=' sign. |
|
524 + offset++; |
|
525 + |
|
526 +#ifndef WIN32 |
|
527 + while ( isspace( *c ) ) { c++; offset++; } |
|
528 +#else |
|
529 + while (((*c) >=0) && isspace( *c )) { c++; offset++; } |
|
530 +#endif |
|
531 + |
|
532 + if ( *c == '>' ) { |
|
533 + |
|
534 + // common malformed markup <a href=> |
|
535 + |
|
536 + // Insert empty attribute value |
|
537 + // need to write a single NULL |
|
538 + write_loc = _termBuffer.write( 1 ); |
|
539 + write_loc[0] = '\0'; |
|
540 + avp.value = write_loc; |
|
541 + avp.begin = byte_position - piatokleng + offset; |
|
542 + avp.end = byte_position - piatokleng + offset; |
|
543 + |
|
544 + } else { |
|
545 + |
|
546 + bool quoted = true; |
|
547 + char quote_char; |
|
548 + if ( *c == '"' || *c =='\'' ) { quote_char = *c; c++; offset++; } |
|
549 + else quoted = false; |
|
550 + |
|
551 + // Attribute value starts here. |
|
552 + |
|
553 + i = 0; |
|
554 +// make sure the opening and closing quote character match... |
|
555 + if ( quoted ) |
|
556 +// while ( c[i] != '"' && c[i] != '>' && c[i] !='\'') i++; |
|
557 + while ( c[i] != quote_char && c[i] != '>') i++; |
|
558 + else |
|
559 +#ifndef WIN32 |
|
560 + while ( ! isspace( c[i] ) && c[i] != '>' ) i++; |
|
561 +#else |
|
562 + while ( ((c[i] >= 0) && ! isspace( c[i] ) ) && c[i] != '>' ) i++; |
|
563 +#endif |
|
564 + |
|
565 + // need to write i characters, plus a NULL |
|
566 + write_loc = _termBuffer.write( i + 1 ); |
|
567 + strncpy( write_loc, c, i ); |
|
568 + write_loc[i] = '\0'; |
|
569 + avp.value = write_loc; |
|
570 + avp.begin = byte_position - piatokleng + offset; |
|
571 + avp.end = byte_position - piatokleng + offset + i; |
|
572 + c += i; |
|
573 + offset += i; |
|
574 + |
|
575 + } |
|
576 + } else { |
|
577 + |
|
578 + // Insert empty attribute value |
|
579 + // need to write a single NULL |
|
580 + write_loc = _termBuffer.write( 1 ); |
|
581 + write_loc[0] = '\0'; |
|
582 + avp.value = write_loc; |
|
583 + avp.begin = byte_position - piatokleng + offset; |
|
584 + avp.end = byte_position - piatokleng + offset; |
|
585 + } |
|
586 +#ifndef WIN32 |
|
587 + while ( isspace( *c ) || *c == '"' ) { c++; offset++; } |
|
588 +#else |
|
589 + while ( ((*c >= 0) && isspace( *c )) || *c == '"' ) { c++; offset++; } |
|
590 +#endif |
|
591 + |
|
592 + te.attributes.push_back( avp ); |
|
593 + } |
|
594 + |
|
595 + _document.tags.push_back( te ); |
|
596 + |
|
597 + } |
|
598 + |
|
599 + // One of the cases that is ignored is this common malformed |
|
600 + // markup <foo=bar> with no tag name. Another is the case |
|
601 + // of an email address <[email protected]> |
|
602 + |
|
603 + |
|
604 + } |
|
605 +} |
|
606 + |
|
607 +void indri::parse::TextTokenizerPIA::processUTF8Token() { |
|
608 + |
|
609 + // A UTF-8 token, as recognized by flex, could actually be |
|
610 + // a mixed ASCII/UTF-8 string containing any number of |
|
611 + // UTF-8 characters, so we re-tokenize it here. |
|
612 + |
|
613 + indri::utility::HashTable<UINT64,const int>& unicode = _transcoder.unicode(); |
|
614 + |
|
615 + int len = strlen( piatoktext ); |
|
616 + |
|
617 + UINT64* unicode_chars = new UINT64[len + 1]; |
|
618 + int* offsets = new int[len + 1]; |
|
619 + int* lengths = new int[len + 1]; |
|
620 + _transcoder.utf8_decode( piatoktext, &unicode_chars, NULL, NULL, |
|
621 + &offsets, &lengths ); |
|
622 + |
|
623 + const int* p; |
|
624 + int cls; // Character class of current UTF-8 character |
|
625 + // offset of current UTF-8 character w/r/t toktext stored in offsets[i] |
|
626 + // byte length of current UTF-8 character stored in lengths[i] |
|
627 + |
|
628 + int offset = 0; // Position of start of current *token* (not character) w/r/t toktext |
|
629 + int extent = 0; // Extent for this *token* including trailing punct |
|
630 + int piatoken_len = 0; // Same as above, minus the trailing punctuation |
|
631 + |
|
632 + char buf[64]; |
|
633 + |
|
634 + // If this flag is true, we have punctuation symbols at the end of a |
|
635 + // token, so do not attach another letter to this token. |
|
636 + bool no_letter = false; |
|
637 + |
|
638 + // In case there are malformed characters preceding the good |
|
639 + // characters: |
|
640 + offset = offsets[0]; |
|
641 + |
|
642 + for ( int i = 0; unicode_chars[i] != 0; i++ ) { |
|
643 + |
|
644 + p = unicode.find( unicode_chars[i] ); |
|
645 + cls = p ? *p : 0; |
|
646 + |
|
647 + if ( ! _tokenize_entire_words ) { // Tokenize by character |
|
648 + |
|
649 + if ( cls != 0 && cls != 3 && cls != 5 && cls != 9 ) { |
|
650 + |
|
651 + writeToken( piatoktext + offsets[i], lengths[i], |
|
652 + byte_position - piatokleng + offsets[i], |
|
653 + byte_position - piatokleng + offsets[i] + lengths[i] ); |
|
654 + } |
|
655 + continue; |
|
656 + } |
|
657 + |
|
658 + // If this is not the first time through this loop, we need |
|
659 + // to check to see if any bytes in toktext were skipped |
|
660 + // during the UTF-8 analysis: |
|
661 + |
|
662 + if ( i != 0 && offset + piatoken_len != offsets[i] ) { |
|
663 + |
|
664 + // Write out the token we are working on, if any: |
|
665 + |
|
666 + if ( piatoken_len > 0 ) { |
|
667 + |
|
668 + writeToken( piatoktext + offset, piatoken_len, |
|
669 + byte_position - piatokleng + offset, |
|
670 + byte_position - piatokleng + offset + extent ); |
|
671 + } |
|
672 + |
|
673 + extent = 0; |
|
674 + piatoken_len = 0; |
|
675 + no_letter = false; |
|
676 + offset = offsets[i]; |
|
677 + } |
|
678 + |
|
679 + // Tokenize by word: |
|
680 + |
|
681 + switch ( cls ) { |
|
682 + |
|
683 + case 4: // Currency symbol: always extracted alone |
|
684 + // Action: write the token we are working on, |
|
685 + // and write this symbol as a separate token |
|
686 + writeToken( piatoktext + offset, extent, |
|
687 + byte_position - piatokleng + offset, |
|
688 + byte_position - piatokleng + offset + extent ); |
|
689 + |
|
690 + offset += extent; |
|
691 + |
|
692 + writeToken( piatoktext + offset, lengths[i], |
|
693 + byte_position - piatokleng + offset, |
|
694 + byte_position - piatokleng + offset + lengths[i] ); |
|
695 + |
|
696 + offset += lengths[i]; |
|
697 + piatoken_len = 0; |
|
698 + extent = 0; |
|
699 + no_letter = false; |
|
700 + break; |
|
701 + |
|
702 + case 1: // Apostrophe |
|
703 + case 10: // Decimal separator |
|
704 + case 6: // Letter |
|
705 + case 7: // Digit |
|
706 + // Action: add this character to the end of the token we are |
|
707 + // working on |
|
708 + if ( no_letter ) { // This is a token boundary |
|
709 + writeToken( piatoktext + offset, piatoken_len, |
|
710 + byte_position - piatokleng + offset, |
|
711 + byte_position - piatokleng + offset + extent ); |
|
712 + |
|
713 + offset += extent; |
|
714 + extent = 0; |
|
715 + piatoken_len = 0; |
|
716 + no_letter = false; |
|
717 + |
|
718 + } |
|
719 + |
|
720 + extent += lengths[i]; |
|
721 + piatoken_len += lengths[i]; |
|
722 + break; |
|
723 + |
|
724 + case 2: // Percent |
|
725 + case 8: // Punctuation |
|
726 + case 12: // Thousands separator |
|
727 + case 11: // Hyphen |
|
728 + // Action: These characters are included in the extent of the |
|
729 + // token we are working on. |
|
730 + no_letter = true; |
|
731 + extent += lengths[i]; |
|
732 + break; |
|
733 + |
|
734 + case 0: // No character class! |
|
735 + case 3: // Control character |
|
736 + case 5: // Non-punctuation symbol |
|
737 + case 9: // Whitespace |
|
738 + default: |
|
739 + // Action: write the token we are working on. Do not include |
|
740 + // this character in any future token. |
|
741 + writeToken( piatoktext + offset, piatoken_len, |
|
742 + byte_position - piatokleng + offset, |
|
743 + byte_position - piatokleng + offset + extent ); |
|
744 + |
|
745 + offset += (extent + lengths[i]); // Include current character |
|
746 + extent = 0; |
|
747 + piatoken_len = 0; |
|
748 + no_letter = false; |
|
749 + |
|
750 + break; |
|
751 + } |
|
752 + } |
|
753 + |
|
754 + // Write out last token |
|
755 + if ( piatoken_len > 0 ) |
|
756 + writeToken( piatoktext + offset, piatoken_len, |
|
757 + byte_position - piatokleng + offset, |
|
758 + byte_position - piatokleng + offset + extent ); |
|
759 + |
|
760 + delete[] unicode_chars; |
|
761 + delete[] offsets; |
|
762 + delete[] lengths; |
|
763 +} |
|
764 + |
|
765 +void indri::parse::TextTokenizerPIA::processASCIIToken() { |
|
766 + |
|
767 + int piatoken_len = strlen( piatoktext ); |
|
768 + |
|
769 + // token_len here is the length of the token without |
|
770 + // any trailing punctuation. |
|
771 + |
|
772 + for ( int i = piatoken_len - 1; i > 0; i-- ) { |
|
773 + |
|
774 + if ( ! ispunct( piatoktext[i] ) ) |
|
775 + break; |
|
776 + else |
|
777 + piatoken_len--; |
|
778 + } |
|
779 + |
|
780 + if ( _tokenize_entire_words ) { |
|
781 + |
|
782 + writeToken( piatoktext, piatoken_len, byte_position - piatokleng, byte_position ); |
|
783 + |
|
784 + } else { |
|
785 + |
|
786 + for ( int i = 0; i < piatoken_len; i++ ) |
|
787 + writeToken( piatoktext + i, 1, byte_position - piatokleng + i, |
|
788 + byte_position - piatokleng + i + 1 ); |
|
789 + } |
|
790 +} |
|
791 + |
|
792 + |
|
793 +// ObjectHandler implementation |
|
794 + |
|
795 +void indri::parse::TextTokenizerPIA::handle( indri::parse::UnparsedDocument* document ) { |
|
796 + |
|
797 + _handler->handle( tokenize( document ) ); |
|
798 +} |
|
799 + |
|
800 +void indri::parse::TextTokenizerPIA::setHandler( ObjectHandler<indri::parse::TokenizedDocument>& h ) { |
|
801 + |
|
802 + _handler = &h; |
|
803 +} |
|
804 + |
|
805 +void indri::parse::TextTokenizerPIA::writeToken( char* token, int piatoken_len, |
|
806 + int extent_begin, int extent_end ) { |
|
807 + |
|
808 + |
|
809 + // The TermExtent for a token will include trailing punctuation. |
|
810 + // The purpose for this is that it makes for a nicer display when a |
|
811 + // sequence of tokens (say, a sentence) is retrieved and shown to |
|
812 + // the user. |
|
813 + |
|
814 + TermExtent extent; |
|
815 + extent.begin = extent_begin; |
|
816 + extent.end = extent_end; |
|
817 + _document.positions.push_back( extent ); |
|
818 + |
|
819 + // The terms entry for a token won't include the punctuation. |
|
820 + |
|
821 + char* write_loc = _termBuffer.write( piatoken_len + 1 ); |
|
822 + strncpy( write_loc, token, piatoken_len ); |
|
823 + write_loc[piatoken_len] = '\0'; |
|
824 + _document.terms.push_back( write_loc ); |
|
825 +} |
|
826 + |
|
827 + |
|
828 --- indri-5.4/include/indri/TextTokenizerPIA.hpp po črc 15 14:38:50 2013 |
|
829 +++ indri-5.4/include/indri/TextTokenizerPIA.hpp po črc 15 14:36:54 2013 |
|
830 @@ -0,0 +1,73 @@ |
|
831 +/*========================================================================== |
|
832 + * Copyright (c) 2003-2005 University of Massachusetts. All Rights Reserved. |
|
833 + * |
|
834 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval |
|
835 + * is subject to the terms of the software license set forth in the LICENSE |
|
836 + * file included with this software, and also available at |
|
837 + * http://www.lemurproject.org/license.html |
|
838 + * |
|
839 + *========================================================================== |
|
840 + */ |
|
841 + |
|
842 +// |
|
843 +// TextTokenizerPIA |
|
844 +// |
|
845 +// 15 September 2005 -- mwb |
|
846 +// |
|
847 + |
|
848 +#ifndef INDRI_TEXTTOKENIZERPIA_HPP |
|
849 +#define INDRI_TEXTTOKENIZERPIA_HPP |
|
850 + |
|
851 +#include <stdio.h> |
|
852 +#include <string> |
|
853 +#include <map> |
|
854 + |
|
855 +#include "indri/IndriTokenizer.hpp" |
|
856 +#include "indri/Buffer.hpp" |
|
857 +#include "indri/TagEvent.hpp" |
|
858 +#include "indri/UnparsedDocument.hpp" |
|
859 +#include "indri/TokenizedDocument.hpp" |
|
860 +#include "indri/UTF8Transcoder.hpp" |
|
861 + |
|
862 +namespace indri { |
|
863 + namespace parse { |
|
864 + |
|
865 + class TextTokenizerPIA : public Tokenizer { |
|
866 + |
|
867 + public: |
|
868 + TextTokenizerPIA( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) { |
|
869 + |
|
870 + _tokenize_markup = tokenize_markup; |
|
871 + _tokenize_entire_words = tokenize_entire_words; |
|
872 + } |
|
873 + |
|
874 + ~TextTokenizerPIA() {} |
|
875 + |
|
876 + TokenizedDocument* tokenize( UnparsedDocument* document ); |
|
877 + |
|
878 + void handle( UnparsedDocument* document ); |
|
879 + void setHandler( ObjectHandler<TokenizedDocument>& h ); |
|
880 + |
|
881 + protected: |
|
882 + void processASCIIToken(); |
|
883 + void processUTF8Token(); |
|
884 + void processTag(); |
|
885 + |
|
886 + indri::utility::Buffer _termBuffer; |
|
887 + UTF8Transcoder _transcoder; |
|
888 + |
|
889 + bool _tokenize_markup; |
|
890 + bool _tokenize_entire_words; |
|
891 + |
|
892 + private: |
|
893 + ObjectHandler<TokenizedDocument>* _handler; |
|
894 + TokenizedDocument _document; |
|
895 + |
|
896 + void writeToken( char* token, int token_len, int extent_begin, |
|
897 + int extent_end ); |
|
898 + }; |
|
899 + } |
|
900 +} |
|
901 + |
|
902 +#endif // INDRI_TEXTTOKENIZERPIA_HPP |
|
903 + |
|
904 --- indri-5.4/src/TokenizerFactory.cpp po črc 15 14:39:30 2013 |
|
905 +++ indri-5.4/src/TokenizerFactory.cpp po črc 15 14:29:11 2013 |
|
906 @@ -22,6 +22,7 @@ |
|
907 |
|
908 #include "indri/TextTokenizer.hpp" |
|
909 // Add an #include for your Tokenizer here. |
|
910 +#include "indri/TextTokenizerPIA.hpp" |
|
911 |
|
912 |
|
913 #define TOKENIZER_WORD ("Word") |
|
914 @@ -29,6 +30,8 @@ |
|
915 #define TOKENIZER_CHAR ("Char") |
|
916 #define TOKENIZER_CHAR_NO_MARKUP ("Char without Markup") |
|
917 // Add a #define for your Tokenizer here. |
|
918 +#define TOKENIZER_PIA ("PIA") |
|
919 +#define TOKENIZER_PIA_NO_MARKUP ("PIA without Markup") |
|
920 |
|
921 |
|
922 // |
|
923 @@ -78,8 +81,23 @@ |
|
924 // got "char" |
|
925 return TOKENIZER_CHAR; |
|
926 |
|
927 + } else if ( ( name[0] == 'p' || name[0] == 'P' ) && |
|
928 + ( name[1] == 'i' || name[1] == 'I' ) && |
|
929 + ( name[2] == 'a' || name[2] == 'A' ) ) { |
|
930 + |
|
931 + if ( name[3] == '-' && |

932 + ( name[4] == 'n' || name[4] == 'N' ) && |

933 + ( name[5] == 'o' || name[5] == 'O' ) ) { |
|
934 + |
|
935 + // got "pia-nomarkup" |
|
936 + return TOKENIZER_PIA_NO_MARKUP; |
|
937 + } |
|
938 + |
|
939 + // got "pia" |
|
940 + return TOKENIZER_PIA; |
|
941 } |
|
942 |
|
943 + |
|
944 return ""; |
|
945 } |
|
946 |
|
947 @@ -105,6 +123,14 @@ |
|
948 |
|
949 tokenizer = new indri::parse::TextTokenizer( false, false ); |
|
950 |
|
951 + } else if ( preferred == TOKENIZER_PIA ) { |
|
952 + |
|
953 + tokenizer = new indri::parse::TextTokenizerPIA(); |
|
954 + |
|
955 + } else if ( preferred == TOKENIZER_PIA_NO_MARKUP ) { |
|
956 + |
|
957 + tokenizer = new indri::parse::TextTokenizerPIA( false ); |
|
958 + |
|
959 } else { |
|
960 |
|
961 LEMUR_THROW( LEMUR_RUNTIME_ERROR, name + " is not a known tokenizer." ); |
|
962 --- indri-5.4/src/FileClassEnvironmentFactory.cpp po črc 15 14:40:19 2013 |
|
963 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp po črc 15 14:29:12 2013 |
|
964 @@ -189,6 +189,20 @@ |
|
965 trec_conflations // conflations |
|
966 }, |
|
967 { |
|
968 + "trecpia", // name |
|
969 + "xml", // parser |
|
970 + "pia", // tokenizer |
|
971 + "tagged", // iterator |
|
972 + "<DOC>", // startDocTag |
|
973 + "</DOC>", // endDocTag |
|
974 + NULL, // endMetadataTag |
|
975 + trec_include_tags, // includeTags |
|
976 + NULL, // excludeTags |
|
977 + trec_index_tags, // indexTags |
|
978 + trec_metadata_tags, // metadataTags |
|
979 + trec_conflations // conflations |
|
980 + }, |
|
981 + { |
|
982 "trecchar", // name |
|
983 "xml", // parser |
|
984 "char", // tokenizer |
|
985 --- indri-5.4/Makefile.app.in 2013-09-04 06:31:06.740210927 -0700 |
|
986 +++ indri-5.4/Makefile.app.in 2013-09-04 06:27:24.857989779 -0700 |
|
987 @@ -1,22 +1,26 @@ |
|
988 +include MakeDefns |
|
989 + |
|
990 ## your application name here |
|
991 -APP= |
|
992 +APP=pia_wrapper |
|
993 SRC=$(APP).cpp |
|
994 ## extra object files for your app here |
|
995 OBJ= |
|
996 +OUTPUT=lib$(APP).so.1 |
|
997 |
|
998 prefix = @prefix@ |
|
999 exec_prefix = ${prefix} |
|
1000 libdir = @libdir@ |
|
1001 includedir = @includedir@ |
|
1002 -INCPATH=-I$(includedir) |
|
1003 -LIBPATH=-L$(libdir) |
|
1004 +INCPATH=-Iinclude -Icontrib/lemur/include |
|
1005 +LIBPATH=-Lobj |
|
1006 CXXFLAGS=@DEFS@ @CPPFLAGS@ @CXXFLAGS@ $(INCPATH) |
|
1007 -CPPLDFLAGS = @LDFLAGS@ -lindri @LIBS@ |
|
1008 +CPPLDFLAGS = @LDFLAGS@ -lnvpair -lindri @LIBS@ |
|
1009 |
|
1010 all: |
|
1011 - $(CXX) $(CXXFLAGS) $(SRC) -o $(APP) $(OBJ) $(LIBPATH) $(CPPLDFLAGS) |
|
1012 + $(CXX) $(CXXFLAGS) $(SRC) -fpic -shared -static-libgcc -h $(OUTPUT) -o $(OUTPUT) $(OBJ) $(LIBPATH) $(CPPLDFLAGS) |
|
1013 |
|
1014 clean: |
|
1015 rm -f $(APP) |
|
1016 |
|
1017 - |
|
1018 +install: |
|
1019 + cp $(OUTPUT) $(libdir) |
|
1020 --- indri-5.4/Makefile 2013-09-12 07:39:16.027125829 -0700 |
|
1021 +++ indri-5.4/Makefile 2013-09-12 07:38:44.720450641 -0700 |
|
1022 @@ -73,5 +73,6 @@ |
|
1023 $(MAKE) install -C doc |
|
1024 $(MAKE) -C site-search install |
|
1025 $(INSTALL_DATA) Makefile.app $(pkgdatadir) |
|
1026 + $(MAKE) -f Makefile.app install |
|
1027 |
|
1028 test: |