components/indri/patches/pia.patch
changeset 1626 8dee2dfe2525
equal deleted inserted replaced
1625:b35ca5ff8eee 1626:8dee2dfe2525
       
     1 Add our PIA wrapper to indri sources. This patch does several things:
       
     2  - Add pia wrapper sources to indri source tree
       
     3  - Add new tokenizer which does not treat '_' as a separator
       
     4    - The TextTokenizerPIA.l differs from TextTokenizer.l only in a single character
       
     5       -[a-zA-Z0-9']+  { byte_position += tokleng; return ASCII_TOKEN; }
       
     6       +[a-zA-Z0-9_']+ { byte_position += tokleng; return ASCII_TOKEN; }
       
     7    - plus many symbol renames so that the parsers can coexist (toktext -> piatoktext etc.)
       
     8    - TextTokenizerPIA.hpp contains only symbol renames
       
     9  - Rest are modifications to make indri build PIA wrapper
       
    10 
       
    11 
       
    12 --- indri-5.4/pia_wrapper.cpp	po črc 15 14:30:41 2013
       
    13 +++ indri-5.4/pia_wrapper.cpp	po črc 15 14:29:09 2013
       
    14 @@ -0,0 +1,222 @@
       
    15 +/*
       
    16 + * TO compile :
       
    17 + *      g++ -o libpia_wrapper.so -shared -fPIC -I../vlad-libs/sparc/usr/include/ -L../vlad-libs/sparc/usr/lib/ -lclucene-core -lnvpair pia_wrapper.cc
       
    18 + *
       
    19 + */
       
    20 +
       
    21 +#include <sys/stat.h>
       
    22 +#include <strings.h>
       
    23 +#include <stdio.h>
       
    24 +#include <libnvpair.h>
       
    25 +
       
    26 +#include <iostream>
       
    27 +#include <string>
       
    28 +#include <sstream>
       
    29 +#include <fstream>
       
    30 +
       
    31 +#include <vector>
       
    32 +#include "indri/QueryEnvironment.hpp"
       
    33 +#include "indri/SnippetBuilder.hpp"
       
    34 +#include "indri/Repository.hpp"
       
    35 +
       
    36 +using namespace std;
       
    37 +
       
    38 +using namespace indri::api;
       
    39 +
       
    40 +#define MAX_RESULTS 3
       
    41 +#define PIA_DATABASE "/var/db/piadb"
       
    42 +#define PIA_DATABASE_STORAGE PIA_DATABASE "/collection/storage"
       
    43 +
       
    44 +indri::collection::Repository repository;
       
    45 +
       
    46 +std::string
       
    47 +getFieldText(int documentID, std::string field) {
       
    48 +	std::string ret_val = "";
       
    49 +	indri::collection::Repository::index_state repIndexState = repository.indexes();
       
    50 +	indri::index::Index *thisIndex=(*repIndexState)[0];
       
    51 +	int fieldID=thisIndex->field(field);
       
    52 +
       
    53 +	if (fieldID < 1) {
       
    54 +		return "";
       
    55 +	}
       
    56 +
       
    57 +	const indri::index::TermList *termList=thisIndex->termList(documentID);
       
    58 +
       
    59 +	if (!termList) {
       
    60 +		return "";
       
    61 +	}
       
    62 +
       
    63 +	indri::utility::greedy_vector< indri::index::FieldExtent > fieldVec=termList->fields();
       
    64 +	indri::utility::greedy_vector< indri::index::FieldExtent >::iterator fIter=fieldVec.begin();
       
    65 +	while (fIter!=fieldVec.end()) {
       
    66 +
       
    67 +		if ((*fIter).id==fieldID) {
       
    68 +			int beginTerm=(*fIter).begin;
       
    69 +			int endTerm=(*fIter).end;
       
    70 +
       
    71 +	        	/*
       
    72 +	 	 	 * note that the text is inclusive of the beginning
       
    73 +		         * but exclusive of the ending
       
    74 +		 	 */
       
    75 +			for (int t=beginTerm; t < endTerm; t++) {
       
    76 +				int thisTermID=termList->terms()[t];
       
    77 +		       		ret_val = ret_val + thisIndex->term(thisTermID) + " ";
       
    78 +			}
       
    79 +		}
       
    80 +
       
    81 +		fIter++;
       
    82 +	}
       
    83 +
       
    84 +	delete termList;
       
    85 +	termList=NULL;
       
    86 +	return ret_val;
       
    87 +}
       
    88 +
       
    89 +/*
       
    90 + * Returns NULL on failure
       
    91 + * nvlist *
       
    92 + * search(
       
    93 + *  nvlist_t *search_params,
       
    94 + *  char **errmsg            // Similar to pia_index()
       
    95 + * );
       
    96 + */
       
    97 +nvlist *
       
    98 +search (nvlist_t *search_params, char **errmsg) {
       
    99 +
       
   100 +	char *index_path = PIA_DATABASE;
       
   101 +	nvlist_t **nvl_list_result;
       
   102 +	nvlist_t *nvl_return;
       
   103 +	nvlist_t *nvl_result;
       
   104 +	nvlist_t *results = NULL;
       
   105 +
       
   106 +	if (nvlist_alloc(&results, NV_UNIQUE_NAME, 0) != 0) {
       
   107 +		*errmsg = strdup("nvlist_alloc failed\n");
       
   108 +		return NULL;
       
   109 +	}
       
   110 +
       
   111 +	try {
       
   112 +		std::string query;
       
   113 +		char *panicstack;
       
   114 +		(void) nvlist_lookup_string(search_params, "stack", &panicstack);
       
   115 +
       
   116 +		QueryEnvironment indriEnvironment;
       
   117 +		indriEnvironment.addIndex(index_path);
       
   118 +
       
   119 +		/* Create Indri query */
       
   120 +		query = "#combine (" + std::string(panicstack) + ")";
       
   121 +
       
   122 +		QueryAnnotation *QAresults=indriEnvironment.runAnnotatedQuery(query.c_str(), MAX_RESULTS);
       
   123 +
       
   124 +		std::vector<indri::api::ScoredExtentResult> resultVector=QAresults->getResults();
       
   125 +
       
   126 +		int totalNumResults=resultVector.size();
       
   127 +
       
   128 +		/* Get Parsed document of the results */
       
   129 +		std::vector<ParsedDocument*> parsedDocs=indriEnvironment.documents(resultVector);
       
   130 +
       
   131 +		int results_to_return = 0;
       
   132 +		for ( size_t i=0; i < totalNumResults && i < MAX_RESULTS; i++ ) {
       
   133 +				results_to_return++;
       
   134 +		}
       
   135 +
       
   136 +		/* Open Repository */
       
   137 +		repository.openRead(index_path);
       
   138 +
       
   139 +		nvl_list_result = (nvlist_t **) malloc(results_to_return * sizeof(nvlist_t *));
       
   140 +
       
   141 +		for ( size_t i=0; i < results_to_return; i++ ) {
       
   142 +
       
   143 +			std::string ret="";
       
   144 +
       
   145 +			int thisResultDocID=resultVector[i].document;
       
   146 +
       
   147 +			if (nvlist_alloc(&nvl_list_result[i], NV_UNIQUE_NAME, 0) != 0) {
       
   148 +				*errmsg = strdup("nvlist_alloc failed\n");
       
   149 +				return NULL;
       
   150 +			}
       
   151 +
       
   152 +			if ((ret = getFieldText(thisResultDocID, "bug")) == "") {
       
   153 +				*errmsg = strdup("Lookup of bugid failed\n");
       
   154 +				return NULL;
       
   155 +			} else if (nvlist_add_string(nvl_list_result[i], "pia-bugid", ret.c_str())) {
       
   156 +				*errmsg = strdup("nvlist_add bugid failed\n");
       
   157 +				return NULL;
       
   158 +			}
       
   159 +
       
   160 +			if ((ret = getFieldText(thisResultDocID, "stack")) == "") {
       
   161 +				*errmsg = strdup("Lookup of stack failed\n");
       
   162 +				return NULL;
       
   163 +			} else if (nvlist_add_string(nvl_list_result[i], "pia-stack", ret.c_str())) {
       
   164 +				*errmsg = strdup("nvlist_add stack failed\n");
       
   165 +				return NULL;
       
   166 +			}
       
   167 +
       
   168 +			if ((ret = getFieldText(thisResultDocID, "signature")) == "") {
       
   169 +				*errmsg = strdup("Lookup of signature failed\n");
       
   170 +				return NULL;
       
   171 +			} else if (nvlist_add_string(nvl_list_result[i], "pia-signature", ret.c_str())) {
       
   172 +				*errmsg = strdup("nvlist_add signature failed\n");
       
   173 +				return NULL;
       
   174 +			}
       
   175 +
       
   176 +			int indri_score = 1000 + (int)resultVector[i].score*1000;
       
   177 +			if (nvlist_add_int32(nvl_list_result[i], "pia-score", indri_score)) {
       
   178 +				*errmsg = strdup("nvlist_add score failed\n");
       
   179 +				return NULL;
       
   180 +			}
       
   181 +		}
       
   182 +		repository.close();
       
   183 +
       
   184 +		nvlist_add_nvlist_array(results, "results", nvl_list_result, results_to_return);
       
   185 +
       
   186 +		for (int i=0; i<results_to_return; i++) {
       
   187 +			nvlist_free(nvl_list_result[i]);
       
   188 +		}
       
   189 +
       
   190 +		return results;
       
   191 +
       
   192 +	} catch(...){
       
   193 +		nvl_list_result = (nvlist_t **) malloc(1 * sizeof(nvlist_t **));
       
   194 +
       
   195 +		if (nvlist_alloc(&nvl_result, NV_UNIQUE_NAME, 0) != 0) {
       
   196 +			*errmsg = strdup("nvlist_alloc failed\n");
       
   197 +			return NULL;
       
   198 +		}
       
   199 +
       
   200 +		if (nvlist_add_string(nvl_result, "error", "Indri Error")) {
       
   201 +			*errmsg = strdup("nvlist_add error failed\n");
       
   202 +			return NULL;
       
   203 +                }
       
   204 +
       
   205 +		nvlist_dup(nvl_result, &nvl_list_result[0], 0);
       
   206 +		nvlist_free(nvl_result);
       
   207 +		nvlist_add_nvlist_array(results, "results", nvl_list_result, 1);
       
   208 +
       
   209 +		return results;
       
   210 +        }
       
   211 +}
       
   212 +
       
   213 +extern "C" nvlist*
       
   214 +pia_search (nvlist_t *search_params, char **errmsg) {
       
   215 +
       
   216 +	return search (search_params, errmsg);
       
   217 +
       
   218 +}
       
   219 +
       
   220 +int
       
   221 +init () {
       
   222 +
       
   223 +	struct stat sb;
       
   224 +	if (stat(PIA_DATABASE_STORAGE, &sb) != 0) {
       
   225 +		return 1;
       
   226 +	}
       
   227 +
       
   228 +	return 0;
       
   229 +}
       
   230 +
       
   231 +extern "C" int
       
   232 +pia_init () {
       
   233 +
       
   234 +	return init ();
       
   235 +
       
   236 +}
       
   237 --- indri-5.4/src/TextTokenizerPIA.l	po črc 15 14:38:12 2013
       
   238 +++ indri-5.4/src/TextTokenizerPIA.l	po črc 15 14:36:55 2013
       
   239 @@ -0,0 +1,588 @@
       
   240 +%option noyywrap
       
   241 +%option never-interactive
       
   242 +%option prefix="piatok"
       
   243 +
       
   244 +%{
       
   245 +
       
   246 +/*==========================================================================
       
   247 + * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
       
   248 + *
       
   249 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
       
   250 + * is subject to the terms of the software license set forth in the LICENSE
       
   251 + * file included with this software, and also available at
       
   252 + * http://www.lemurproject.org/license.html
       
   253 + *
       
   254 + *==========================================================================
       
   255 + */
       
   256 +
       
   257 +//
       
   258 +// TextTokenizerPIA
       
   259 +//
       
   260 +// 15 September 2005 -- mwb
       
   261 +//
       
   262 +
       
   263 +#include <string.h>
       
   264 +#include <ctype.h>
       
   265 +#include "indri/TextTokenizerPIA.hpp"
       
   266 +#include "indri/TermExtent.hpp"
       
   267 +#include "indri/TagEvent.hpp"
       
   268 +#include "indri/TokenizedDocument.hpp"
       
   269 +#include "indri/UnparsedDocument.hpp"
       
   270 +#include "indri/UTF8Transcoder.hpp"
       
   271 +#include "indri/AttributeValuePair.hpp"
       
   272 +
       
   273 +static long byte_position;
       
   274 +
       
   275 +#define ZAP           1
       
   276 +#define TAG           2
       
   277 +#define ASCII_TOKEN   3
       
   278 +#define UTF8_TOKEN    4
       
   279 +
       
   280 +%}
       
   281 +%start COMMENT
       
   282 +%%
       
   283 +
       
   284 +"<!--" { BEGIN(COMMENT); byte_position += piatokleng; return ZAP; }
       
   285 +<COMMENT>[^-]+ { byte_position += piatokleng; return ZAP; }
       
   286 +<COMMENT>"-->" { BEGIN(INITIAL); byte_position += piatokleng; return ZAP; }
       
   287 +<COMMENT>"-" { byte_position += piatokleng; return ZAP; }
       
   288 +"<!"[^\>]*">" { byte_position += piatokleng; return ZAP; }
       
   289 +\<[a-zA-Z/][^\>]*\>                                             { byte_position += piatokleng; return TAG; }
       
   290 +[&]([a-zA-Z]+|[#]([0-9]+|[xX][a-fA-F0-9]+))[;]         { byte_position += piatokleng; return ZAP; /* symbols */ }
       
   291 +[A-Z0-9]"."([A-Z0-9]".")*                                        { byte_position += piatokleng; return ASCII_TOKEN; }
       
   292 +[a-zA-Z0-9_']+                                        { byte_position += piatokleng; return ASCII_TOKEN; }
       
   293 +"-"[0-9]+("."[0-9]+)?                                  { byte_position += piatokleng; return ASCII_TOKEN; }
       
   294 +[a-zA-Z0-9\x80-\xFD]+                               { byte_position += piatokleng; return UTF8_TOKEN; }
       
   295 +
       
   296 +[\n]                                                   { byte_position += piatokleng; return ZAP; }
       
   297 +.                                                      { byte_position += piatokleng; return ZAP; }
       
   298 +
       
   299 +%%
       
   300 +
       
   301 +indri::parse::TokenizedDocument* indri::parse::TextTokenizerPIA::tokenize( indri::parse::UnparsedDocument* document ) {
       
   302 +
       
   303 +  _termBuffer.clear();
       
   304 +  if ( _tokenize_entire_words)
       
   305 +    _termBuffer.grow( document->textLength * 4);
       
   306 +  else
       
   307 +    _termBuffer.grow( document->textLength * 8 ); // extra null per char.
       
   308 +
       
   309 +  _document.terms.clear();
       
   310 +  _document.tags.clear();
       
   311 +  _document.positions.clear();
       
   312 +
       
   313 +  _document.metadata = document->metadata;
       
   314 +  _document.text = document->text;
       
   315 +  _document.textLength = document->textLength;
       
   316 +  _document.content = document->content;
       
   317 +  _document.contentLength = document->contentLength;
       
   318 +
       
   319 +  // byte offset
       
   320 +  byte_position = document->content - document->text;
       
   321 +
       
   322 +  piatok_scan_bytes( document->content, document->contentLength );
       
   323 +
       
   324 +  // Main Tokenizer loop
       
   325 +
       
   326 +  int type;
       
   327 +
       
   328 +  while ( type = piatoklex() ) {
       
   329 +
       
   330 +    switch ( type ) {
       
   331 +
       
   332 +    case ASCII_TOKEN: processASCIIToken(); break;
       
   333 +
       
   334 +    case UTF8_TOKEN: processUTF8Token(); break;
       
   335 +
       
   336 +    case TAG: if ( _tokenize_markup ) processTag(); break;
       
   337 +
       
   338 +    default:
       
   339 +    case ZAP:
       
   340 +      break;
       
   341 +
       
   342 +    }
       
   343 +
       
   344 +  }
       
   345 +
       
   346 +  piatok_delete_buffer( YY_CURRENT_BUFFER );
       
   347 +
       
   348 +  return &_document;
       
   349 +}
       
   350 +
       
   351 +// Member functions for processing tokenization events as dispatched
       
   352 +// from the main tokenizer loop
       
   353 +
       
   354 +void indri::parse::TextTokenizerPIA::processTag() {
       
   355 +
       
   356 +  // Here, we parse the tag in a fashion that is relatively robust to
       
   357 +  // malformed markup.  toktext matches this pattern: <[^>]+>
       
   358 +
       
   359 +  if ( piatoktext[1] == '?' || piatoktext[1] == '!' ) {
       
   360 +
       
   361 +    // XML declaration like <? ... ?> and <!DOCTYPE ... >
       
   362 +    return; // ignore
       
   363 +
       
   364 +  } else if ( piatoktext[1] == '/' ) { // close tag, eg. </FOO>
       
   365 +
       
   366 +    // Downcase the tag name.
       
   367 +
       
   368 +    int len = 0;
       
   369 +
       
   370 +    for ( char *c = piatoktext + 2;
       
   371 +#ifndef WIN32
       
   372 +          isalnum( *c ) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
       
   373 +#else
       
   374 +          ((*c >= 0) && isalnum( *c )) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
       
   375 +#endif
       
   376 +
       
   377 +      *c = tolower( *c );
       
   378 +      if ( *c == ':' ) *c = '_'; /* replace colon (from namespaces) */
       
   379 +      len++;
       
   380 +    }
       
   381 +
       
   382 +    TagEvent te;
       
   383 +
       
   384 +    te.open_tag = false;
       
   385 +
       
   386 +    // We need to write len characters, plus a NULL
       
   387 +    char* write_loc = _termBuffer.write( len + 1 );
       
   388 +    strncpy( write_loc, piatoktext + 2, len );
       
   389 +    write_loc[len] = '\0';
       
   390 +    te.name = write_loc;
       
   391 +
       
   392 +    // token position of tag event w/r/t token string
       
   393 +    te.pos = _document.terms.size();
       
   394 +
       
   395 +    te.begin = byte_position - piatokleng;
       
   396 +    te.end = byte_position;
       
   397 +
       
   398 +    _document.tags.push_back( te );
       
   399 +
       
   400 +#ifndef WIN32
       
   401 +    } else if ( isalpha( piatoktext[1] ) ) {
       
   402 +#else
       
   403 +    } else if ( (piatoktext[1]  >= 0) && (isalpha( piatoktext[1] ) )) {
       
   404 +#endif
       
   405 +
       
   406 +    // Try to extract the tag name:
       
   407 +
       
   408 +    char* c = piatoktext + 1;
       
   409 +    int i = 0;
       
   410 +    int offset = 1; // current offset w/r/t byte_position - piatokleng
       
   411 +    // it starts at one because it is incremented when c is, and c starts at one.
       
   412 +    char* write_loc;
       
   413 +
       
   414 +#ifndef WIN32
       
   415 +    while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
       
   416 +#else
       
   417 +    while ( ( (c[i] >= 0) && isalnum( c[i] )) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
       
   418 +#endif
       
   419 +    if ( c[i] == '>' ) {
       
   420 +
       
   421 +      // open tag with no attributes, eg. <title>
       
   422 +
       
   423 +      // Ensure tag name is downcased
       
   424 +      for ( int j = 0; j < i; j++ ) {
       
   425 +        c[j] = tolower( c[j] );
       
   426 +        if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
       
   427 +      }
       
   428 +
       
   429 +      TagEvent te;
       
   430 +
       
   431 +      te.open_tag = true;
       
   432 +
       
   433 +      // need to write i characters, plus a NULL
       
   434 +      char* write_loc = _termBuffer.write( i + 1 );
       
   435 +      strncpy( write_loc, c, i );
       
   436 +      write_loc[i] = '\0';
       
   437 +      te.name = write_loc;
       
   438 +
       
   439 +      te.pos = _document.terms.size();
       
   440 +
       
   441 +      te.begin = byte_position - piatokleng;
       
   442 +      te.end = byte_position;
       
   443 +
       
   444 +      _document.tags.push_back( te );
       
   445 +
       
   446 +#ifndef WIN32
       
   447 +    } else if ( isspace( c[i] ) ) {
       
   448 +#else
       
   449 +    } else if ( (c[i]  >= 0) && (isspace( c[i] ) )) {
       
   450 +#endif
       
   451 +
       
   452 +      // open tag with attributes, eg. <A HREF="www.foo.com/bar">
       
   453 +
       
   454 +      TagEvent te;
       
   455 +
       
   456 +      te.open_tag = true;
       
   457 +
       
   458 +      // Ensure tag name is downcased
       
   459 +      for ( int j = 0; j < i; j++ ) {
       
   460 +        c[j] = tolower( c[j] );
       
   461 +        if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
       
   462 +      }
       
   463 +
       
   464 +      // need to write i characters, plus a NULL
       
   465 +      char* write_loc = _termBuffer.write( i + 1 );
       
   466 +      strncpy( write_loc, c, i );
       
   467 +      write_loc[i] = '\0';
       
   468 +      te.name = write_loc;
       
   469 +      c += i;
       
   470 +      offset += i;
       
   471 +
       
   472 +#ifndef WIN32
       
   473 +    while ( isspace( *c ) ) { c++; offset++; }
       
   474 +#else
       
   475 +    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
       
   476 +#endif
       
   477 +
       
   478 +      te.pos = _document.terms.size();
       
   479 +
       
   480 +      te.begin = byte_position - piatokleng;
       
   481 +      te.end = byte_position;
       
   482 +
       
   483 +      // Now search for attributes:
       
   484 +
       
   485 +      while ( *c != '>' && *c != '\0' ) {
       
   486 +
       
   487 +        AttributeValuePair avp;
       
   488 +
       
   489 +        // Try to extract attribute name:
       
   490 +
       
   491 +        i = 0;
       
   492 +#ifndef WIN32
       
   493 +        while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' ) i++;
       
   494 +#else
       
   495 +        while ( (c[i] >= 0) && isalnum( c[i] ) || c[i] == '-' || c[i] == '_') i++;
       
   496 +#endif
       
   497 +
       
   498 +        if ( i == 0 ) break;
       
   499 +
       
   500 +        // Ensure attribute name is downcased
       
   501 +        for ( int j = 0; j < i; j++ )
       
   502 +          c[j] = tolower( c[j] );
       
   503 +
       
   504 +        // need to write i characters, plus a NULL
       
   505 +        write_loc = _termBuffer.write( i + 1 );
       
   506 +        strncpy( write_loc, c, i );
       
   507 +        write_loc[i] = '\0';
       
   508 +        avp.attribute = write_loc;
       
   509 +        c += i;
       
   510 +        offset += i;
       
   511 +
       
   512 +        // attributes can be foo\s*=\s*"bar[">] or foo\s*=\s*bar
       
   513 +
       
   514 +		// ignore any spaces
       
   515 +#ifndef WIN32
       
   516 +    while ( isspace( *c ) ) { c++; offset++; }
       
   517 +#else
       
   518 +    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
       
   519 +#endif
       
   520 +
       
   521 +        if ( *c == '=' ) {
       
   522 +
       
   523 +          c++; // get past the '=' sign.
       
   524 +          offset++;
       
   525 +
       
   526 +#ifndef WIN32
       
   527 +    while ( isspace( *c ) ) { c++; offset++; }
       
   528 +#else
       
   529 +    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
       
   530 +#endif
       
   531 +
       
   532 +          if ( *c == '>' ) {
       
   533 +
       
   534 +            // common malformed markup <a href=>
       
   535 +
       
   536 +            // Insert empty attribute value
       
   537 +            // need to write a single NULL
       
   538 +            write_loc = _termBuffer.write( 1 );
       
   539 +            write_loc[0] = '\0';
       
   540 +            avp.value = write_loc;
       
   541 +            avp.begin = byte_position - piatokleng + offset;
       
   542 +            avp.end = byte_position - piatokleng + offset;
       
   543 +
       
   544 +          } else {
       
   545 +
       
   546 +            bool quoted = true;
       
   547 +            char quote_char;
       
   548 +            if ( *c == '"' || *c =='\'' ) { quote_char = *c; c++; offset++; }
       
   549 +            else quoted = false;
       
   550 +
       
   551 +            // Attribute value starts here.
       
   552 +
       
   553 +            i = 0;
       
   554 +// make sure the opening and closing quote character match...
       
   555 +            if ( quoted )
       
   556 +//              while ( c[i] != '"' && c[i] != '>' && c[i] !='\'') i++;
       
   557 +              while ( c[i] != quote_char && c[i] != '>') i++;
       
   558 +            else
       
   559 +#ifndef WIN32
       
   560 +              while ( ! isspace( c[i] ) && c[i] != '>' ) i++;
       
   561 +#else
       
   562 +              while ( ((c[i] >= 0)  && ! isspace( c[i] ) ) && c[i] != '>' ) i++;
       
   563 +#endif
       
   564 +
       
   565 +            // need to write i characters, plus a NULL
       
   566 +            write_loc = _termBuffer.write( i + 1 );
       
   567 +            strncpy( write_loc, c, i );
       
   568 +            write_loc[i] = '\0';
       
   569 +            avp.value = write_loc;
       
   570 +            avp.begin = byte_position - piatokleng + offset;
       
   571 +            avp.end = byte_position - piatokleng + offset + i;
       
   572 +            c += i;
       
   573 +            offset += i;
       
   574 +
       
   575 +          }
       
   576 +        } else {
       
   577 +
       
   578 +          // Insert empty attribute value
       
   579 +          // need to write a single NULL
       
   580 +          write_loc = _termBuffer.write( 1 );
       
   581 +          write_loc[0] = '\0';
       
   582 +          avp.value = write_loc;
       
   583 +          avp.begin = byte_position - piatokleng + offset;
       
   584 +          avp.end = byte_position - piatokleng + offset;
       
   585 +        }
       
   586 +#ifndef WIN32
       
   587 +        while ( isspace( *c ) || *c == '"' ) { c++; offset++; }
       
   588 +#else
       
   589 +        while ( ((*c >= 0) && isspace( *c )) || *c == '"' ) { c++; offset++; }
       
   590 +#endif
       
   591 +
       
   592 +        te.attributes.push_back( avp );
       
   593 +      }
       
   594 +
       
   595 +      _document.tags.push_back( te );
       
   596 +
       
   597 +    }
       
   598 +
       
   599 +    // One of the cases that is ignored is this common malformed
       
   600 +    // markup <foo=bar> with no tag name.  Another is the case
       
   601 +    // of an email address <[email protected]>
       
   602 +
       
   603 +
       
   604 +  }
       
   605 +}
       
   606 +
       
   607 +void indri::parse::TextTokenizerPIA::processUTF8Token() {
       
   608 +
       
   609 +  // A UTF-8 token, as recognized by flex, could actually be
       
   610 +  // a mixed ASCII/UTF-8 string containing any number of
       
   611 +  // UTF-8 characters, so we re-tokenize it here.
       
   612 +
       
   613 +  indri::utility::HashTable<UINT64,const int>& unicode = _transcoder.unicode();
       
   614 +
       
   615 +  int len = strlen( piatoktext );
       
   616 +
       
   617 +  UINT64* unicode_chars = new UINT64[len + 1];
       
   618 +  int* offsets = new int[len + 1];
       
   619 +  int* lengths = new int[len + 1];
       
   620 +  _transcoder.utf8_decode( piatoktext, &unicode_chars, NULL, NULL,
       
   621 +                           &offsets, &lengths );
       
   622 +
       
   623 +  const int* p;
       
   624 +  int cls;             // Character class of current UTF-8 character
       
   625 +  // offset of current UTF-8 character w/r/t toktext stored in offsets[i]
       
   626 +  // byte length of current UTF-8 character stored in lengths[i]
       
   627 +
       
   628 +  int offset = 0;      // Position of start of current *token* (not character) w/r/t toktext
       
   629 +  int extent = 0;      // Extent for this *token* including trailing punct
       
   630 +  int piatoken_len = 0;   // Same as above, minus the trailing punctuation
       
   631 +
       
   632 +  char buf[64];
       
   633 +
       
   634 +  // If this flag is true, we have punctuation symbols at the end of a
       
   635 +  // token, so do not attach another letter to this token.
       
   636 +  bool no_letter = false;
       
   637 +
       
   638 +  // In case there are malformed characters preceding the good
       
   639 +  // characters:
       
   640 +  offset = offsets[0];
       
   641 +
       
   642 +  for ( int i = 0; unicode_chars[i] != 0; i++ ) {
       
   643 +
       
   644 +    p = unicode.find( unicode_chars[i] );
       
   645 +    cls = p ? *p : 0;
       
   646 +
       
   647 +    if ( ! _tokenize_entire_words ) { // Tokenize by character
       
   648 +
       
   649 +      if ( cls != 0 && cls != 3 && cls != 5 && cls != 9 ) {
       
   650 +
       
   651 +        writeToken( piatoktext + offsets[i], lengths[i],
       
   652 +                    byte_position - piatokleng + offsets[i],
       
   653 +                    byte_position - piatokleng + offsets[i] + lengths[i] );
       
   654 +      }
       
   655 +      continue;
       
   656 +    }
       
   657 +
       
   658 +    // If this is not the first time through this loop, we need
       
   659 +    // to check to see if any bytes in toktext were skipped
       
   660 +    // during the UTF-8 analysis:
       
   661 +
       
   662 +    if ( i != 0 && offset + piatoken_len != offsets[i] ) {
       
   663 +
       
   664 +      // Write out the token we are working on, if any:
       
   665 +
       
   666 +      if ( piatoken_len > 0 ) {
       
   667 +
       
   668 +        writeToken( piatoktext + offset, piatoken_len,
       
   669 +                    byte_position - piatokleng + offset,
       
   670 +                    byte_position - piatokleng + offset + extent );
       
   671 +      }
       
   672 +
       
   673 +      extent = 0;
       
   674 +      piatoken_len = 0;
       
   675 +      no_letter = false;
       
   676 +      offset = offsets[i];
       
   677 +    }
       
   678 +
       
   679 +    // Tokenize by word:
       
   680 +
       
   681 +    switch ( cls ) {
       
   682 +
       
   683 +    case 4: // Currency symbol: always extracted alone
       
   684 +      // Action: write the token we are working on,
       
   685 +      // and write this symbol as a separate token
       
   686 +      writeToken( piatoktext + offset, extent,
       
   687 +                  byte_position - piatokleng + offset,
       
   688 +                  byte_position - piatokleng + offset + extent );
       
   689 +
       
   690 +      offset += extent;
       
   691 +
       
   692 +      writeToken( piatoktext + offset, lengths[i],
       
   693 +                  byte_position - piatokleng + offset,
       
   694 +                  byte_position - piatokleng + offset + lengths[i] );
       
   695 +
       
   696 +      offset += lengths[i];
       
   697 +      piatoken_len = 0;
       
   698 +      extent = 0;
       
   699 +      no_letter = false;
       
   700 +      break;
       
   701 +
       
   702 +    case 1: // Apostrophe
       
   703 +    case 10: // Decimal separator
       
   704 +    case 6: // Letter
       
   705 +    case 7: // Digit
       
   706 +      // Action: add this character to the end of the token we are
       
   707 +      // working on
       
   708 +      if ( no_letter ) { // This is a token boundary
       
   709 +        writeToken( piatoktext + offset, piatoken_len,
       
   710 +                    byte_position - piatokleng + offset,
       
   711 +                    byte_position - piatokleng + offset + extent );
       
   712 +
       
   713 +        offset += extent;
       
   714 +        extent = 0;
       
   715 +        piatoken_len = 0;
       
   716 +        no_letter = false;
       
   717 +
       
   718 +      }
       
   719 +
       
   720 +      extent += lengths[i];
       
   721 +      piatoken_len += lengths[i];
       
   722 +      break;
       
   723 +
       
   724 +    case 2: // Percent
       
   725 +    case 8: // Punctuation
       
   726 +    case 12: // Thousands separator
       
   727 +    case 11: // Hyphen
       
   728 +      // Action: These characters are included in the extent of the
       
   729 +      // token we are working on.
       
   730 +      no_letter = true;
       
   731 +      extent += lengths[i];
       
   732 +      break;
       
   733 +
       
   734 +    case 0: // No character class!
       
   735 +    case 3: // Control character
       
   736 +    case 5: // Non-punctuation symbol
       
   737 +    case 9: // Whitespace
       
   738 +    default:
       
   739 +      // Action: write the token we are working on.  Do not include
       
   740 +      // this character in any future token.
       
   741 +      writeToken( piatoktext + offset, piatoken_len,
       
   742 +                  byte_position - piatokleng + offset,
       
   743 +                  byte_position - piatokleng + offset + extent );
       
   744 +
       
   745 +      offset += (extent + lengths[i]); // Include current character
       
   746 +      extent = 0;
       
   747 +      piatoken_len = 0;
       
   748 +      no_letter = false;
       
   749 +
       
   750 +      break;
       
   751 +    }
       
   752 +  }
       
   753 +
       
   754 +  // Write out last token
       
   755 +  if ( piatoken_len > 0 )
       
   756 +    writeToken( piatoktext + offset, piatoken_len,
       
   757 +                byte_position - piatokleng + offset,
       
   758 +                byte_position - piatokleng + offset + extent );
       
   759 +
       
   760 +  delete[] unicode_chars;
       
   761 +  delete[] offsets;
       
   762 +  delete[] lengths;
       
   763 +}
       
   764 +
       
   765 +void indri::parse::TextTokenizerPIA::processASCIIToken() {
       
   766 +
       
   767 +  int piatoken_len = strlen( piatoktext );
       
   768 +
       
   769 +  // token_len here is the length of the token without
       
   770 +  // any trailing punctuation.
       
   771 +
       
   772 +  for ( int i = piatoken_len - 1; i > 0; i-- ) {
       
   773 +
       
   774 +    if ( ! ispunct( piatoktext[i] ) )
       
   775 +      break;
       
   776 +    else
       
   777 +      piatoken_len--;
       
   778 +  }
       
   779 +
       
   780 +  if ( _tokenize_entire_words ) {
       
   781 +
       
   782 +    writeToken( piatoktext, piatoken_len, byte_position - piatokleng, byte_position );
       
   783 +
       
   784 +  } else {
       
   785 +
       
   786 +    for ( int i = 0; i < piatoken_len; i++ )
       
   787 +      writeToken( piatoktext + i, 1, byte_position - piatokleng + i,
       
   788 +                  byte_position - piatokleng + i + 1 );
       
   789 +  }
       
   790 +}
       
   791 +
       
   792 +
       
   793 +// ObjectHandler implementation
       
   794 +
       
   795 +void indri::parse::TextTokenizerPIA::handle( indri::parse::UnparsedDocument* document ) {
       
   796 +
       
   797 +  _handler->handle( tokenize( document ) );
       
   798 +}
       
   799 +
       
   800 +void indri::parse::TextTokenizerPIA::setHandler( ObjectHandler<indri::parse::TokenizedDocument>& h ) {
       
   801 +
       
   802 +  _handler = &h;
       
   803 +}
       
   804 +
       
   805 +void indri::parse::TextTokenizerPIA::writeToken( char* token, int piatoken_len,
       
   806 +                                              int extent_begin, int extent_end ) {
       
   807 +
       
   808 +
       
   809 +  // The TermExtent for a token will include trailing punctuation.
       
   810 +  // The purpose for this is that it makes for a nicer display when a
       
   811 +  // sequence of tokens (say, a sentence) is retrieved and shown to
       
   812 +  // the user.
       
   813 +
       
   814 +  TermExtent extent;
       
   815 +  extent.begin = extent_begin;
       
   816 +  extent.end = extent_end;
       
   817 +  _document.positions.push_back( extent );
       
   818 +
       
   819 +  // The terms entry for a token won't include the punctuation.
       
   820 +
       
   821 +  char* write_loc = _termBuffer.write( piatoken_len + 1 );
       
   822 +  strncpy( write_loc, token, piatoken_len );
       
   823 +  write_loc[piatoken_len] = '\0';
       
   824 +  _document.terms.push_back( write_loc );
       
   825 +}
       
   826 +
       
   827 +
       
   828 --- indri-5.4/include/indri/TextTokenizerPIA.hpp	po črc 15 14:38:50 2013
       
   829 +++ indri-5.4/include/indri/TextTokenizerPIA.hpp	po črc 15 14:36:54 2013
       
   830 @@ -0,0 +1,73 @@
       
   831 +/*==========================================================================
       
   832 + * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
       
   833 + *
       
   834 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
       
   835 + * is subject to the terms of the software license set forth in the LICENSE
       
   836 + * file included with this software, and also available at
       
   837 + * http://www.lemurproject.org/license.html
       
   838 + *
       
   839 + *==========================================================================
       
   840 + */
       
   841 +
       
   842 +//
       
   843 +// TextTokenizerPIA
       
   844 +//
       
   845 +// 15 September 2005 -- mwb
       
   846 +//
       
   847 +
       
   848 +#ifndef INDRI_TEXTTOKENIZERPIA_HPP
       
   849 +#define INDRI_TEXTTOKENIZERPIA_HPP
       
   850 +
       
   851 +#include <stdio.h>
       
   852 +#include <string>
       
   853 +#include <map>
       
   854 +
       
   855 +#include "indri/IndriTokenizer.hpp"
       
   856 +#include "indri/Buffer.hpp"
       
   857 +#include "indri/TagEvent.hpp"
       
   858 +#include "indri/UnparsedDocument.hpp"
       
   859 +#include "indri/TokenizedDocument.hpp"
       
   860 +#include "indri/UTF8Transcoder.hpp"
       
   861 +
       
   862 +namespace indri {
       
   863 +  namespace parse {
       
   864 +
       
   865 +    class TextTokenizerPIA : public Tokenizer {
       
   866 +
       
   867 +    public:
       
   868 +      TextTokenizerPIA( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) {
       
   869 +
       
   870 +        _tokenize_markup = tokenize_markup;
       
   871 +        _tokenize_entire_words = tokenize_entire_words;
       
   872 +      }
       
   873 +
       
   874 +      ~TextTokenizerPIA() {}
       
   875 +
       
   876 +      TokenizedDocument* tokenize( UnparsedDocument* document );
       
   877 +
       
   878 +      void handle( UnparsedDocument* document );
       
   879 +      void setHandler( ObjectHandler<TokenizedDocument>& h );
       
   880 +
       
   881 +    protected:
       
   882 +      void processASCIIToken();
       
   883 +      void processUTF8Token();
       
   884 +      void processTag();
       
   885 +
       
   886 +      indri::utility::Buffer _termBuffer;
       
   887 +      UTF8Transcoder _transcoder;
       
   888 +
       
   889 +      bool _tokenize_markup;
       
   890 +      bool _tokenize_entire_words;
       
   891 +
       
   892 +    private:
       
   893 +      ObjectHandler<TokenizedDocument>* _handler;
       
   894 +      TokenizedDocument _document;
       
   895 +
       
   896 +      void writeToken( char* token, int token_len, int extent_begin,
       
   897 +                       int extent_end );
       
   898 +    };
       
   899 +  }
       
   900 +}
       
   901 +
       
   902 +#endif // INDRI_TEXTTOKENIZERPIA_HPP
       
   903 +
       
   904 --- indri-5.4/src/TokenizerFactory.cpp	po črc 15 14:39:30 2013
       
   905 +++ indri-5.4/src/TokenizerFactory.cpp	po črc 15 14:29:11 2013
       
   906 @@ -22,6 +22,7 @@
       
   907 
       
   908  #include "indri/TextTokenizer.hpp"
       
   909  // Add an #include for your Tokenizer here.
       
   910 +#include "indri/TextTokenizerPIA.hpp"
       
   911 
       
   912 
       
   913  #define TOKENIZER_WORD ("Word")
       
   914 @@ -29,6 +30,8 @@
       
   915  #define TOKENIZER_CHAR ("Char")
       
   916  #define TOKENIZER_CHAR_NO_MARKUP ("Char without Markup")
       
   917  // Add a #define for your Tokenizer here.
       
   918 +#define TOKENIZER_PIA ("PIA")
       
   919 +#define TOKENIZER_PIA_NO_MARKUP ("PIA without Markup")
       
   920 
       
   921 
       
   922  //
       
   923 @@ -78,8 +81,23 @@
       
   924      // got "char"
       
   925      return TOKENIZER_CHAR;
       
   926 
       
   927 +  } else if ( ( name[0] == 'p' || name[0] == 'P' ) &&
       
   928 +       ( name[1] == 'i' || name[1] == 'I' ) &&
       
    929 +       ( name[2] == 'a' || name[2] == 'A' ) ) {
       
   930 +
       
    931 +    if ( name[3] == '-' &&
        
    932 +         ( name[4] == 'n' || name[4] == 'N' ) &&
       
   933 +         ( name[5] == 'o' || name[5] == 'O' ) ) {
       
   934 +
       
   935 +      // got "pia-nomarkup"
       
   936 +      return TOKENIZER_PIA_NO_MARKUP;
       
   937 +    }
       
   938 +
       
   939 +    // got "pia"
       
   940 +    return TOKENIZER_PIA;
       
   941    }
       
   942 
       
   943 +
       
   944    return "";
       
   945  }
       
   946 
       
   947 @@ -105,6 +123,14 @@
       
   948 
       
   949      tokenizer = new indri::parse::TextTokenizer( false, false );
       
   950 
       
   951 +  } else if ( preferred == TOKENIZER_PIA ) {
       
   952 +
       
   953 +    tokenizer = new indri::parse::TextTokenizerPIA();
       
   954 +
       
   955 +  } else if ( preferred == TOKENIZER_PIA_NO_MARKUP ) {
       
   956 +
       
   957 +    tokenizer = new indri::parse::TextTokenizerPIA( false );
       
   958 +
       
   959    } else {
       
   960 
       
   961      LEMUR_THROW( LEMUR_RUNTIME_ERROR, name + " is not a known tokenizer." );
       
   962 --- indri-5.4/src/FileClassEnvironmentFactory.cpp	po črc 15 14:40:19 2013
       
   963 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp	po črc 15 14:29:12 2013
       
   964 @@ -189,6 +189,20 @@
       
   965      trec_conflations      // conflations
       
   966    },
       
   967    {
       
   968 +    "trecpia",           // name
       
   969 +    "xml",                // parser
       
   970 +    "pia",               // tokenizer
       
   971 +    "tagged",             // iterator
       
   972 +    "<DOC>",              // startDocTag
       
   973 +    "</DOC>",             // endDocTag
       
   974 +    NULL,                 // endMetadataTag
       
   975 +    trec_include_tags,    // includeTags
       
   976 +    NULL,                 // excludeTags
       
   977 +    trec_index_tags,      // indexTags
       
   978 +    trec_metadata_tags,   // metadataTags
       
   979 +    trec_conflations      // conflations
       
   980 +  },
       
   981 +  {
       
   982      "trecchar",           // name
       
   983      "xml",                // parser
       
   984      "char",               // tokenizer
       
   985 --- indri-5.4/Makefile.app.in	2013-09-04 06:31:06.740210927 -0700
       
   986 +++ indri-5.4/Makefile.app.in	2013-09-04 06:27:24.857989779 -0700
       
   987 @@ -1,22 +1,26 @@
       
   988 +include MakeDefns
       
   989 +
       
   990  ## your application name here
       
   991 -APP=
       
   992 +APP=pia_wrapper
       
   993  SRC=$(APP).cpp
       
   994  ## extra object files for your app here
       
   995  OBJ=
       
   996 +OUTPUT=lib$(APP).so.1
       
   997 
       
   998  prefix = @prefix@
       
   999  exec_prefix = ${prefix}
       
  1000  libdir = @libdir@
       
  1001  includedir = @includedir@
       
  1002 -INCPATH=-I$(includedir)
       
  1003 -LIBPATH=-L$(libdir)
       
  1004 +INCPATH=-Iinclude -Icontrib/lemur/include
       
  1005 +LIBPATH=-Lobj
       
  1006  CXXFLAGS=@DEFS@ @CPPFLAGS@ @CXXFLAGS@ $(INCPATH)
       
  1007 -CPPLDFLAGS  = @LDFLAGS@ -lindri @LIBS@
       
  1008 +CPPLDFLAGS  = @LDFLAGS@ -lnvpair -lindri @LIBS@
       
  1009 
       
  1010  all:
       
  1011 -	$(CXX) $(CXXFLAGS) $(SRC) -o $(APP) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
       
  1012 +	$(CXX) $(CXXFLAGS) $(SRC) -fpic -shared -static-libgcc -h $(OUTPUT) -o $(OUTPUT) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
       
  1013 
       
  1014  clean:
       
  1015  	rm -f $(APP)
       
  1016 
       
  1017 -
       
  1018 +install:
       
  1019 +	cp $(OUTPUT) $(libdir)
       
  1020 --- indri-5.4/Makefile	2013-09-12 07:39:16.027125829 -0700
       
  1021 +++ indri-5.4/Makefile	2013-09-12 07:38:44.720450641 -0700
       
  1022 @@ -73,5 +73,6 @@
       
  1023  	$(MAKE) install -C doc
       
  1024  	$(MAKE) -C site-search install
       
  1025  	$(INSTALL_DATA) Makefile.app $(pkgdatadir)
       
  1026 +	$(MAKE) -f Makefile.app install
       
  1027 
       
  1028  test: