|
1 Add our PIA wrapper to indri sources. This patch does several things: |
|
2 - Add pia wrapper sources to indri source tree |
|
3 - Add new tokenizer which does not treat '_' as a separator |
|
4 - The TextTokenizerPIA.l differs from TextTokenizer.l only in a single character |
|
5 -[a-zA-Z0-9']+ { byte_position += tokleng; return ASCII_TOKEN; } |
|
6 +[a-zA-Z0-9_']+ { byte_position += tokleng; return ASCII_TOKEN; } |
|
7 - plus many symbol renames so that the parsers can coexist (toktext -> piatoktext etc.) |
|
8 - TextTokenizerPIA.hpp contains only symbol renames |
|
9 - Rest are modifications to make indri build PIA wrapper |
|
10 |
|
11 |
|
12 --- indri-5.4/pia_wrapper.cpp po črc 15 14:30:41 2013 |
|
13 +++ indri-5.4/pia_wrapper.cpp po črc 15 14:29:09 2013 |
|
14 @@ -0,0 +1,222 @@ |
|
15 +/* |
|
16 + * TO compile : |
|
17 + * g++ -o libpia_wrapper.so -shared -fPIC -I../vlad-libs/sparc/usr/include/ -L../vlad-libs/sparc/usr/lib/ -lclucene-core -lnvpair pia_wrapper.cpp |
|
18 + * |
|
19 + */ |
|
20 + |
|
21 +#include <sys/stat.h> |
|
22 +#include <strings.h> |
|
23 +#include <stdio.h> |
|
24 +#include <libnvpair.h> |
|
25 + |
|
26 +#include <iostream> |
|
27 +#include <string> |
|
28 +#include <sstream> |
|
29 +#include <fstream> |
|
30 + |
|
31 +#include <vector> |
|
32 +#include "indri/QueryEnvironment.hpp" |
|
33 +#include "indri/SnippetBuilder.hpp" |
|
34 +#include "indri/Repository.hpp" |
|
35 + |
|
36 +using namespace std; |
|
37 + |
|
38 +using namespace indri::api; |
|
39 + |
|
40 +#define MAX_RESULTS 3 |
|
41 +#define PIA_DATABASE "/var/db/piadb" |
|
42 +#define PIA_DATABASE_STORAGE PIA_DATABASE "/collection/storage" |
|
43 + |
|
44 +indri::collection::Repository repository; |
|
45 + |
|
46 +std::string |
|
47 +getFieldText(int documentID, std::string field) { |
|
48 + std::string ret_val = ""; |
|
49 + indri::collection::Repository::index_state repIndexState = repository.indexes(); |
|
50 + indri::index::Index *thisIndex=(*repIndexState)[0]; |
|
51 + int fieldID=thisIndex->field(field); |
|
52 + |
|
53 + if (fieldID < 1) { |
|
54 + return ""; |
|
55 + } |
|
56 + |
|
57 + const indri::index::TermList *termList=thisIndex->termList(documentID); |
|
58 + |
|
59 + if (!termList) { |
|
60 + return ""; |
|
61 + } |
|
62 + |
|
63 + indri::utility::greedy_vector< indri::index::FieldExtent > fieldVec=termList->fields(); |
|
64 + indri::utility::greedy_vector< indri::index::FieldExtent >::iterator fIter=fieldVec.begin(); |
|
65 + while (fIter!=fieldVec.end()) { |
|
66 + |
|
67 + if ((*fIter).id==fieldID) { |
|
68 + int beginTerm=(*fIter).begin; |
|
69 + int endTerm=(*fIter).end; |
|
70 + |
|
71 + /* |
|
72 + * note that the text is inclusive of the beginning |
|
73 + * but exclusive of the ending |
|
74 + */ |
|
75 + for (int t=beginTerm; t < endTerm; t++) { |
|
76 + int thisTermID=termList->terms()[t]; |
|
77 + ret_val = ret_val + thisIndex->term(thisTermID) + " "; |
|
78 + } |
|
79 + } |
|
80 + |
|
81 + fIter++; |
|
82 + } |
|
83 + |
|
84 + delete termList; |
|
85 + termList=NULL; |
|
86 + return ret_val; |
|
87 +} |
|
88 + |
|
89 +/* |
|
90 + * Returns NULL on failure |
|
91 + * nvlist * |
|
92 + * search( |
|
93 + * nvlist_t *search_params, |
|
94 + * char **errmsg // Similar to pia_index() |
|
95 + * ); |
|
96 + */ |
|
97 +nvlist * |
|
98 +search (nvlist_t *search_params, char **errmsg) { |
|
99 + |
|
100 + char *index_path = PIA_DATABASE; |
|
101 + nvlist_t **nvl_list_result; |
|
102 + nvlist_t *nvl_return; |
|
103 + nvlist_t *nvl_result; |
|
104 + nvlist_t *results = NULL; |
|
105 + |
|
106 + if (nvlist_alloc(&results, NV_UNIQUE_NAME, 0) != 0) { |
|
107 + *errmsg = strdup("nvlist_alloc failed\n"); |
|
108 + return NULL; |
|
109 + } |
|
110 + |
|
111 + try { |
|
112 + std::string query; |
|
113 + char *panicstack; |
|
114 + (void) nvlist_lookup_string(search_params, "stack", &panicstack); |
|
115 + |
|
116 + QueryEnvironment indriEnvironment; |
|
117 + indriEnvironment.addIndex(index_path); |
|
118 + |
|
119 + /* Create Indri query */ |
|
120 + query = "#combine (" + std::string(panicstack) + ")"; |
|
121 + |
|
122 + QueryAnnotation *QAresults=indriEnvironment.runAnnotatedQuery(query.c_str(), MAX_RESULTS); |
|
123 + |
|
124 + std::vector<indri::api::ScoredExtentResult> resultVector=QAresults->getResults(); |
|
125 + |
|
126 + int totalNumResults=resultVector.size(); |
|
127 + |
|
128 + /* Get Parsed document of the results */ |
|
129 + std::vector<ParsedDocument*> parsedDocs=indriEnvironment.documents(resultVector); |
|
130 + |
|
131 + int results_to_return = 0; |
|
132 + for ( size_t i=0; i < totalNumResults && i < MAX_RESULTS; i++ ) { |
|
133 + results_to_return++; |
|
134 + } |
|
135 + |
|
136 + /* Open Repository */ |
|
137 + repository.openRead(index_path); |
|
138 + |
|
139 + nvl_list_result = (nvlist_t **) malloc(results_to_return * sizeof(nvlist_t *)); |
|
140 + |
|
141 + for ( size_t i=0; i < results_to_return; i++ ) { |
|
142 + |
|
143 + std::string ret=""; |
|
144 + |
|
145 + int thisResultDocID=resultVector[i].document; |
|
146 + |
|
147 + if (nvlist_alloc(&nvl_list_result[i], NV_UNIQUE_NAME, 0) != 0) { |
|
148 + *errmsg = strdup("nvlist_alloc failed\n"); |
|
149 + return NULL; |
|
150 + } |
|
151 + |
|
152 + if ((ret = getFieldText(thisResultDocID, "bug")) == "") { |
|
153 + *errmsg = strdup("Lookup of bugid failed\n"); |
|
154 + return NULL; |
|
155 + } else if (nvlist_add_string(nvl_list_result[i], "pia-bugid", ret.c_str())) { |
|
156 + *errmsg = strdup("nvlist_add bugid failed\n"); |
|
157 + return NULL; |
|
158 + } |
|
159 + |
|
160 + if ((ret = getFieldText(thisResultDocID, "stack")) == "") { |
|
161 + *errmsg = strdup("Lookup of stack failed\n"); |
|
162 + return NULL; |
|
163 + } else if (nvlist_add_string(nvl_list_result[i], "pia-stack", ret.c_str())) { |
|
164 + *errmsg = strdup("nvlist_add stack failed\n"); |
|
165 + return NULL; |
|
166 + } |
|
167 + |
|
168 + if ((ret = getFieldText(thisResultDocID, "signature")) == "") { |
|
169 + *errmsg = strdup("Lookup of signature failed\n"); |
|
170 + return NULL; |
|
171 + } else if (nvlist_add_string(nvl_list_result[i], "pia-signature", ret.c_str())) { |
|
172 + *errmsg = strdup("nvlist_add signature failed\n"); |
|
173 + return NULL; |
|
174 + } |
|
175 + |
|
176 + int indri_score = 1000 + (int)resultVector[i].score*1000; |
|
177 + if (nvlist_add_int32(nvl_list_result[i], "pia-score", indri_score)) { |
|
178 + *errmsg = strdup("nvlist_add score failed\n"); |
|
179 + return NULL; |
|
180 + } |
|
181 + } |
|
182 + repository.close(); |
|
183 + |
|
184 + nvlist_add_nvlist_array(results, "results", nvl_list_result, results_to_return); |
|
185 + |
|
186 + for (int i=0; i<results_to_return; i++) { |
|
187 + nvlist_free(nvl_list_result[i]); |
|
188 + } |
|
189 + |
|
190 + return results; |
|
191 + |
|
192 + } catch(...){ |
|
193 + nvl_list_result = (nvlist_t **) malloc(1 * sizeof(nvlist_t **)); |
|
194 + |
|
195 + if (nvlist_alloc(&nvl_result, NV_UNIQUE_NAME, 0) != 0) { |
|
196 + *errmsg = strdup("nvlist_alloc failed\n"); |
|
197 + return NULL; |
|
198 + } |
|
199 + |
|
200 + if (nvlist_add_string(nvl_result, "error", "Indri Error")) { |
|
201 + *errmsg = strdup("nvlist_add error failed\n"); |
|
202 + return NULL; |
|
203 + } |
|
204 + |
|
205 + nvlist_dup(nvl_result, &nvl_list_result[0], 0); |
|
206 + nvlist_free(nvl_result); |
|
207 + nvlist_add_nvlist_array(results, "results", nvl_list_result, 1); |
|
208 + |
|
209 + return results; |
|
210 + } |
|
211 +} |
|
212 + |
|
213 +extern "C" nvlist* |
|
214 +pia_search (nvlist_t *search_params, char **errmsg) { |
|
215 + |
|
216 + return search (search_params, errmsg); |
|
217 + |
|
218 +} |
|
219 + |
|
220 +int |
|
221 +init () { |
|
222 + |
|
223 + struct stat sb; |
|
224 + if (stat(PIA_DATABASE_STORAGE, &sb) != 0) { |
|
225 + return 1; |
|
226 + } |
|
227 + |
|
228 + return 0; |
|
229 +} |
|
230 + |
|
231 +extern "C" int |
|
232 +pia_init () { |
|
233 + |
|
234 + return init (); |
|
235 + |
|
236 +} |
|
237 --- indri-5.4/src/TextTokenizerPIA.l po črc 15 14:38:12 2013 |
|
238 +++ indri-5.4/src/TextTokenizerPIA.l po črc 15 14:36:55 2013 |
|
239 @@ -0,0 +1,588 @@ |
|
240 +%option noyywrap |
|
241 +%option never-interactive |
|
242 +%option prefix="piatok" |
|
243 + |
|
244 +%{ |
|
245 + |
|
246 +/*========================================================================== |
|
247 + * Copyright (c) 2004 University of Massachusetts. All Rights Reserved. |
|
248 + * |
|
249 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval |
|
250 + * is subject to the terms of the software license set forth in the LICENSE |
|
251 + * file included with this software, and also available at |
|
252 + * http://www.lemurproject.org/license.html |
|
253 + * |
|
254 + *========================================================================== |
|
255 + */ |
|
256 + |
|
257 +// |
|
258 +// TextTokenizerPIA |
|
259 +// |
|
260 +// 15 September 2005 -- mwb |
|
261 +// |
|
262 + |
|
263 +#include <string.h> |
|
264 +#include <ctype.h> |
|
265 +#include "indri/TextTokenizerPIA.hpp" |
|
266 +#include "indri/TermExtent.hpp" |
|
267 +#include "indri/TagEvent.hpp" |
|
268 +#include "indri/TokenizedDocument.hpp" |
|
269 +#include "indri/UnparsedDocument.hpp" |
|
270 +#include "indri/UTF8Transcoder.hpp" |
|
271 +#include "indri/AttributeValuePair.hpp" |
|
272 + |
|
273 +static long byte_position; |
|
274 + |
|
275 +#define ZAP 1 |
|
276 +#define TAG 2 |
|
277 +#define ASCII_TOKEN 3 |
|
278 +#define UTF8_TOKEN 4 |
|
279 + |
|
280 +%} |
|
281 +%start COMMENT |
|
282 +%% |
|
283 + |
|
284 +"<!--" { BEGIN(COMMENT); byte_position += piatokleng; return ZAP; } |
|
285 +<COMMENT>[^-]+ { byte_position += piatokleng; return ZAP; } |
|
286 +<COMMENT>"-->" { BEGIN(INITIAL); byte_position += piatokleng; return ZAP; } |
|
287 +<COMMENT>"-" { byte_position += piatokleng; return ZAP; } |
|
288 +"<!"[^\>]*">" { byte_position += piatokleng; return ZAP; } |
|
289 +\<[a-zA-Z/][^\>]*\> { byte_position += piatokleng; return TAG; } |
|
290 +[&]([a-zA-Z]+|[#]([0-9]+|[xX][a-fA-F0-9]+))[;] { byte_position += piatokleng; return ZAP; /* symbols */ } |
|
291 +[A-Z0-9]"."([A-Z0-9]".")* { byte_position += piatokleng; return ASCII_TOKEN; } |
|
292 +[a-zA-Z0-9_']+ { byte_position += piatokleng; return ASCII_TOKEN; } |
|
293 +"-"[0-9]+("."[0-9]+)? { byte_position += piatokleng; return ASCII_TOKEN; } |
|
294 +[a-zA-Z0-9\x80-\xFD]+ { byte_position += piatokleng; return UTF8_TOKEN; } |
|
295 + |
|
296 +[\n] { byte_position += piatokleng; return ZAP; } |
|
297 +. { byte_position += piatokleng; return ZAP; } |
|
298 + |
|
299 +%% |
|
300 + |
|
301 +indri::parse::TokenizedDocument* indri::parse::TextTokenizerPIA::tokenize( indri::parse::UnparsedDocument* document ) { |
|
302 + |
|
303 + _termBuffer.clear(); |
|
304 + if ( _tokenize_entire_words) |
|
305 + _termBuffer.grow( document->textLength * 4); |
|
306 + else |
|
307 + _termBuffer.grow( document->textLength * 8 ); // extra null per char. |
|
308 + |
|
309 + _document.terms.clear(); |
|
310 + _document.tags.clear(); |
|
311 + _document.positions.clear(); |
|
312 + |
|
313 + _document.metadata = document->metadata; |
|
314 + _document.text = document->text; |
|
315 + _document.textLength = document->textLength; |
|
316 + _document.content = document->content; |
|
317 + _document.contentLength = document->contentLength; |
|
318 + |
|
319 + // byte offset |
|
320 + byte_position = document->content - document->text; |
|
321 + |
|
322 + piatok_scan_bytes( document->content, document->contentLength ); |
|
323 + |
|
324 + // Main Tokenizer loop |
|
325 + |
|
326 + int type; |
|
327 + |
|
328 + while ( type = piatoklex() ) { |
|
329 + |
|
330 + switch ( type ) { |
|
331 + |
|
332 + case ASCII_TOKEN: processASCIIToken(); break; |
|
333 + |
|
334 + case UTF8_TOKEN: processUTF8Token(); break; |
|
335 + |
|
336 + case TAG: if ( _tokenize_markup ) processTag(); break; |
|
337 + |
|
338 + default: |
|
339 + case ZAP: |
|
340 + break; |
|
341 + |
|
342 + } |
|
343 + |
|
344 + } |
|
345 + |
|
346 + piatok_delete_buffer( YY_CURRENT_BUFFER ); |
|
347 + |
|
348 + return &_document; |
|
349 +} |
|
350 + |
|
351 +// Member functions for processing tokenization events as dispatched |
|
352 +// from the main tokenizer loop |
|
353 + |
|
354 +void indri::parse::TextTokenizerPIA::processTag() { |
|
355 + |
|
356 + // Here, we parse the tag in a fashion that is relatively robust to |
|
357 + // malformed markup. toktext matches this pattern: <[^>]+> |
|
358 + |
|
359 + if ( piatoktext[1] == '?' || piatoktext[1] == '!' ) { |
|
360 + |
|
361 + // XML declaration like <? ... ?> and <!DOCTYPE ... > |
|
362 + return; // ignore |
|
363 + |
|
364 + } else if ( piatoktext[1] == '/' ) { // close tag, eg. </FOO> |
|
365 + |
|
366 + // Downcase the tag name. |
|
367 + |
|
368 + int len = 0; |
|
369 + |
|
370 + for ( char *c = piatoktext + 2; |
|
371 +#ifndef WIN32 |
|
372 + isalnum( *c ) || *c == '-' || *c == '_' || *c == ':' ; c++ ) { |
|
373 +#else |
|
374 + ((*c >= 0) && isalnum( *c )) || *c == '-' || *c == '_' || *c == ':' ; c++ ) { |
|
375 +#endif |
|
376 + |
|
377 + *c = tolower( *c ); |
|
378 + if ( *c == ':' ) *c = '_'; /* replace colon (from namespaces) */ |
|
379 + len++; |
|
380 + } |
|
381 + |
|
382 + TagEvent te; |
|
383 + |
|
384 + te.open_tag = false; |
|
385 + |
|
386 + // We need to write len characters, plus a NULL |
|
387 + char* write_loc = _termBuffer.write( len + 1 ); |
|
388 + strncpy( write_loc, piatoktext + 2, len ); |
|
389 + write_loc[len] = '\0'; |
|
390 + te.name = write_loc; |
|
391 + |
|
392 + // token position of tag event w/r/t token string |
|
393 + te.pos = _document.terms.size(); |
|
394 + |
|
395 + te.begin = byte_position - piatokleng; |
|
396 + te.end = byte_position; |
|
397 + |
|
398 + _document.tags.push_back( te ); |
|
399 + |
|
400 +#ifndef WIN32 |
|
401 + } else if ( isalpha( piatoktext[1] ) ) { |
|
402 +#else |
|
403 + } else if ( (piatoktext[1] >= 0) && (isalpha( piatoktext[1] ) )) { |
|
404 +#endif |
|
405 + |
|
406 + // Try to extract the tag name: |
|
407 + |
|
408 + char* c = piatoktext + 1; |
|
409 + int i = 0; |
|
410 + int offset = 1; // current offset w/r/t byte_position - piatokleng |
|
411 + // it starts at one because it is incremented when c is, and c starts at one. |
|
412 + char* write_loc; |
|
413 + |
|
414 +#ifndef WIN32 |
|
415 + while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++; |
|
416 +#else |
|
417 + while ( ( (c[i] >= 0) && isalnum( c[i] )) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++; |
|
418 +#endif |
|
419 + if ( c[i] == '>' ) { |
|
420 + |
|
421 + // open tag with no attributes, eg. <title> |
|
422 + |
|
423 + // Ensure tag name is downcased |
|
424 + for ( int j = 0; j < i; j++ ) { |
|
425 + c[j] = tolower( c[j] ); |
|
426 + if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */ |
|
427 + } |
|
428 + |
|
429 + TagEvent te; |
|
430 + |
|
431 + te.open_tag = true; |
|
432 + |
|
433 + // need to write i characters, plus a NULL |
|
434 + char* write_loc = _termBuffer.write( i + 1 ); |
|
435 + strncpy( write_loc, c, i ); |
|
436 + write_loc[i] = '\0'; |
|
437 + te.name = write_loc; |
|
438 + |
|
439 + te.pos = _document.terms.size(); |
|
440 + |
|
441 + te.begin = byte_position - piatokleng; |
|
442 + te.end = byte_position; |
|
443 + |
|
444 + _document.tags.push_back( te ); |
|
445 + |
|
446 +#ifndef WIN32 |
|
447 + } else if ( isspace( c[i] ) ) { |
|
448 +#else |
|
449 + } else if ( (c[i] >= 0) && (isspace( c[i] ) )) { |
|
450 +#endif |
|
451 + |
|
452 + // open tag with attributes, eg. <A HREF="www.foo.com/bar"> |
|
453 + |
|
454 + TagEvent te; |
|
455 + |
|
456 + te.open_tag = true; |
|
457 + |
|
458 + // Ensure tag name is downcased |
|
459 + for ( int j = 0; j < i; j++ ) { |
|
460 + c[j] = tolower( c[j] ); |
|
461 + if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */ |
|
462 + } |
|
463 + |
|
464 + // need to write i characters, plus a NULL |
|
465 + char* write_loc = _termBuffer.write( i + 1 ); |
|
466 + strncpy( write_loc, c, i ); |
|
467 + write_loc[i] = '\0'; |
|
468 + te.name = write_loc; |
|
469 + c += i; |
|
470 + offset += i; |
|
471 + |
|
472 +#ifndef WIN32 |
|
473 + while ( isspace( *c ) ) { c++; offset++; } |
|
474 +#else |
|
475 + while (((*c) >=0) && isspace( *c )) { c++; offset++; } |
|
476 +#endif |
|
477 + |
|
478 + te.pos = _document.terms.size(); |
|
479 + |
|
480 + te.begin = byte_position - piatokleng; |
|
481 + te.end = byte_position; |
|
482 + |
|
483 + // Now search for attributes: |
|
484 + |
|
485 + while ( *c != '>' && *c != '\0' ) { |
|
486 + |
|
487 + AttributeValuePair avp; |
|
488 + |
|
489 + // Try to extract attribute name: |
|
490 + |
|
491 + i = 0; |
|
492 +#ifndef WIN32 |
|
493 + while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' ) i++; |
|
494 +#else |
|
495 + while ( (c[i] >= 0) && isalnum( c[i] ) || c[i] == '-' || c[i] == '_') i++; |
|
496 +#endif |
|
497 + |
|
498 + if ( i == 0 ) break; |
|
499 + |
|
500 + // Ensure attribute name is downcased |
|
501 + for ( int j = 0; j < i; j++ ) |
|
502 + c[j] = tolower( c[j] ); |
|
503 + |
|
504 + // need to write i characters, plus a NULL |
|
505 + write_loc = _termBuffer.write( i + 1 ); |
|
506 + strncpy( write_loc, c, i ); |
|
507 + write_loc[i] = '\0'; |
|
508 + avp.attribute = write_loc; |
|
509 + c += i; |
|
510 + offset += i; |
|
511 + |
|
512 + // attributes can be foo\s*=\s*"bar[">] or foo\s*=\s*bar |
|
513 + |
|
514 + // ignore any spaces |
|
515 +#ifndef WIN32 |
|
516 + while ( isspace( *c ) ) { c++; offset++; } |
|
517 +#else |
|
518 + while (((*c) >=0) && isspace( *c )) { c++; offset++; } |
|
519 +#endif |
|
520 + |
|
521 + if ( *c == '=' ) { |
|
522 + |
|
523 + c++; // get past the '=' sign. |
|
524 + offset++; |
|
525 + |
|
526 +#ifndef WIN32 |
|
527 + while ( isspace( *c ) ) { c++; offset++; } |
|
528 +#else |
|
529 + while (((*c) >=0) && isspace( *c )) { c++; offset++; } |
|
530 +#endif |
|
531 + |
|
532 + if ( *c == '>' ) { |
|
533 + |
|
534 + // common malformed markup <a href=> |
|
535 + |
|
536 + // Insert empty attribute value |
|
537 + // need to write a single NULL |
|
538 + write_loc = _termBuffer.write( 1 ); |
|
539 + write_loc[0] = '\0'; |
|
540 + avp.value = write_loc; |
|
541 + avp.begin = byte_position - piatokleng + offset; |
|
542 + avp.end = byte_position - piatokleng + offset; |
|
543 + |
|
544 + } else { |
|
545 + |
|
546 + bool quoted = true; |
|
547 + char quote_char; |
|
548 + if ( *c == '"' || *c =='\'' ) { quote_char = *c; c++; offset++; } |
|
549 + else quoted = false; |
|
550 + |
|
551 + // Attribute value starts here. |
|
552 + |
|
553 + i = 0; |
|
554 +// make sure the opening and closing quote character match... |
|
555 + if ( quoted ) |
|
556 +// while ( c[i] != '"' && c[i] != '>' && c[i] !='\'') i++; |
|
557 + while ( c[i] != quote_char && c[i] != '>') i++; |
|
558 + else |
|
559 +#ifndef WIN32 |
|
560 + while ( ! isspace( c[i] ) && c[i] != '>' ) i++; |
|
561 +#else |
|
562 + while ( ((c[i] >= 0) && ! isspace( c[i] ) ) && c[i] != '>' ) i++; |
|
563 +#endif |
|
564 + |
|
565 + // need to write i characters, plus a NULL |
|
566 + write_loc = _termBuffer.write( i + 1 ); |
|
567 + strncpy( write_loc, c, i ); |
|
568 + write_loc[i] = '\0'; |
|
569 + avp.value = write_loc; |
|
570 + avp.begin = byte_position - piatokleng + offset; |
|
571 + avp.end = byte_position - piatokleng + offset + i; |
|
572 + c += i; |
|
573 + offset += i; |
|
574 + |
|
575 + } |
|
576 + } else { |
|
577 + |
|
578 + // Insert empty attribute value |
|
579 + // need to write a single NULL |
|
580 + write_loc = _termBuffer.write( 1 ); |
|
581 + write_loc[0] = '\0'; |
|
582 + avp.value = write_loc; |
|
583 + avp.begin = byte_position - piatokleng + offset; |
|
584 + avp.end = byte_position - piatokleng + offset; |
|
585 + } |
|
586 +#ifndef WIN32 |
|
587 + while ( isspace( *c ) || *c == '"' ) { c++; offset++; } |
|
588 +#else |
|
589 + while ( ((*c >= 0) && isspace( *c )) || *c == '"' ) { c++; offset++; } |
|
590 +#endif |
|
591 + |
|
592 + te.attributes.push_back( avp ); |
|
593 + } |
|
594 + |
|
595 + _document.tags.push_back( te ); |
|
596 + |
|
597 + } |
|
598 + |
|
599 + // One of the cases that is ignored is this common malformed |
|
600 + // markup <foo=bar> with no tag name. Another is the case |
|
601 + // of an email address <[email protected]> |
|
602 + |
|
603 + |
|
604 + } |
|
605 +} |
|
606 + |
|
607 +void indri::parse::TextTokenizerPIA::processUTF8Token() { |
|
608 + |
|
609 + // A UTF-8 token, as recognized by flex, could actually be |
|
610 + // a mixed ASCII/UTF-8 string containing any number of |
|
611 + // UTF-8 characters, so we re-tokenize it here. |
|
612 + |
|
613 + indri::utility::HashTable<UINT64,const int>& unicode = _transcoder.unicode(); |
|
614 + |
|
615 + int len = strlen( piatoktext ); |
|
616 + |
|
617 + UINT64* unicode_chars = new UINT64[len + 1]; |
|
618 + int* offsets = new int[len + 1]; |
|
619 + int* lengths = new int[len + 1]; |
|
620 + _transcoder.utf8_decode( piatoktext, &unicode_chars, NULL, NULL, |
|
621 + &offsets, &lengths ); |
|
622 + |
|
623 + const int* p; |
|
624 + int cls; // Character class of current UTF-8 character |
|
625 + // offset of current UTF-8 character w/r/t toktext stored in offsets[i] |
|
626 + // byte length of current UTF-8 character stored in lengths[i] |
|
627 + |
|
628 + int offset = 0; // Position of start of current *token* (not character) w/r/t toktext |
|
629 + int extent = 0; // Extent for this *token* including trailing punct |
|
630 + int piatoken_len = 0; // Same as above, minus the trailing punctuation |
|
631 + |
|
632 + char buf[64]; |
|
633 + |
|
634 + // If this flag is true, we have punctuation symbols at the end of a |
|
635 + // token, so do not attach another letter to this token. |
|
636 + bool no_letter = false; |
|
637 + |
|
638 + // In case there are malformed characters preceding the good |
|
639 + // characters: |
|
640 + offset = offsets[0]; |
|
641 + |
|
642 + for ( int i = 0; unicode_chars[i] != 0; i++ ) { |
|
643 + |
|
644 + p = unicode.find( unicode_chars[i] ); |
|
645 + cls = p ? *p : 0; |
|
646 + |
|
647 + if ( ! _tokenize_entire_words ) { // Tokenize by character |
|
648 + |
|
649 + if ( cls != 0 && cls != 3 && cls != 5 && cls != 9 ) { |
|
650 + |
|
651 + writeToken( piatoktext + offsets[i], lengths[i], |
|
652 + byte_position - piatokleng + offsets[i], |
|
653 + byte_position - piatokleng + offsets[i] + lengths[i] ); |
|
654 + } |
|
655 + continue; |
|
656 + } |
|
657 + |
|
658 + // If this is not the first time through this loop, we need |
|
659 + // to check to see if any bytes in toktext were skipped |
|
660 + // during the UTF-8 analysis: |
|
661 + |
|
662 + if ( i != 0 && offset + piatoken_len != offsets[i] ) { |
|
663 + |
|
664 + // Write out the token we are working on, if any: |
|
665 + |
|
666 + if ( piatoken_len > 0 ) { |
|
667 + |
|
668 + writeToken( piatoktext + offset, piatoken_len, |
|
669 + byte_position - piatokleng + offset, |
|
670 + byte_position - piatokleng + offset + extent ); |
|
671 + } |
|
672 + |
|
673 + extent = 0; |
|
674 + piatoken_len = 0; |
|
675 + no_letter = false; |
|
676 + offset = offsets[i]; |
|
677 + } |
|
678 + |
|
679 + // Tokenize by word: |
|
680 + |
|
681 + switch ( cls ) { |
|
682 + |
|
683 + case 4: // Currency symbol: always extracted alone |
|
684 + // Action: write the token we are working on, |
|
685 + // and write this symbol as a separate token |
|
686 + writeToken( piatoktext + offset, extent, |
|
687 + byte_position - piatokleng + offset, |
|
688 + byte_position - piatokleng + offset + extent ); |
|
689 + |
|
690 + offset += extent; |
|
691 + |
|
692 + writeToken( piatoktext + offset, lengths[i], |
|
693 + byte_position - piatokleng + offset, |
|
694 + byte_position - piatokleng + offset + lengths[i] ); |
|
695 + |
|
696 + offset += lengths[i]; |
|
697 + piatoken_len = 0; |
|
698 + extent = 0; |
|
699 + no_letter = false; |
|
700 + break; |
|
701 + |
|
702 + case 1: // Apostrophe |
|
703 + case 10: // Decimal separator |
|
704 + case 6: // Letter |
|
705 + case 7: // Digit |
|
706 + // Action: add this character to the end of the token we are |
|
707 + // working on |
|
708 + if ( no_letter ) { // This is a token boundary |
|
709 + writeToken( piatoktext + offset, piatoken_len, |
|
710 + byte_position - piatokleng + offset, |
|
711 + byte_position - piatokleng + offset + extent ); |
|
712 + |
|
713 + offset += extent; |
|
714 + extent = 0; |
|
715 + piatoken_len = 0; |
|
716 + no_letter = false; |
|
717 + |
|
718 + } |
|
719 + |
|
720 + extent += lengths[i]; |
|
721 + piatoken_len += lengths[i]; |
|
722 + break; |
|
723 + |
|
724 + case 2: // Percent |
|
725 + case 8: // Punctuation |
|
726 + case 12: // Thousands separator |
|
727 + case 11: // Hyphen |
|
728 + // Action: These characters are included in the extent of the |
|
729 + // token we are working on. |
|
730 + no_letter = true; |
|
731 + extent += lengths[i]; |
|
732 + break; |
|
733 + |
|
734 + case 0: // No character class! |
|
735 + case 3: // Control character |
|
736 + case 5: // Non-punctuation symbol |
|
737 + case 9: // Whitespace |
|
738 + default: |
|
739 + // Action: write the token we are working on. Do not include |
|
740 + // this character in any future token. |
|
741 + writeToken( piatoktext + offset, piatoken_len, |
|
742 + byte_position - piatokleng + offset, |
|
743 + byte_position - piatokleng + offset + extent ); |
|
744 + |
|
745 + offset += (extent + lengths[i]); // Include current character |
|
746 + extent = 0; |
|
747 + piatoken_len = 0; |
|
748 + no_letter = false; |
|
749 + |
|
750 + break; |
|
751 + } |
|
752 + } |
|
753 + |
|
754 + // Write out last token |
|
755 + if ( piatoken_len > 0 ) |
|
756 + writeToken( piatoktext + offset, piatoken_len, |
|
757 + byte_position - piatokleng + offset, |
|
758 + byte_position - piatokleng + offset + extent ); |
|
759 + |
|
760 + delete[] unicode_chars; |
|
761 + delete[] offsets; |
|
762 + delete[] lengths; |
|
763 +} |
|
764 + |
|
765 +void indri::parse::TextTokenizerPIA::processASCIIToken() { |
|
766 + |
|
767 + int piatoken_len = strlen( piatoktext ); |
|
768 + |
|
769 + // token_len here is the length of the token without |
|
770 + // any trailing punctuation. |
|
771 + |
|
772 + for ( int i = piatoken_len - 1; i > 0; i-- ) { |
|
773 + |
|
774 + if ( ! ispunct( piatoktext[i] ) ) |
|
775 + break; |
|
776 + else |
|
777 + piatoken_len--; |
|
778 + } |
|
779 + |
|
780 + if ( _tokenize_entire_words ) { |
|
781 + |
|
782 + writeToken( piatoktext, piatoken_len, byte_position - piatokleng, byte_position ); |
|
783 + |
|
784 + } else { |
|
785 + |
|
786 + for ( int i = 0; i < piatoken_len; i++ ) |
|
787 + writeToken( piatoktext + i, 1, byte_position - piatokleng + i, |
|
788 + byte_position - piatokleng + i + 1 ); |
|
789 + } |
|
790 +} |
|
791 + |
|
792 + |
|
793 +// ObjectHandler implementation |
|
794 + |
|
795 +void indri::parse::TextTokenizerPIA::handle( indri::parse::UnparsedDocument* document ) { |
|
796 + |
|
797 + _handler->handle( tokenize( document ) ); |
|
798 +} |
|
799 + |
|
800 +void indri::parse::TextTokenizerPIA::setHandler( ObjectHandler<indri::parse::TokenizedDocument>& h ) { |
|
801 + |
|
802 + _handler = &h; |
|
803 +} |
|
804 + |
|
805 +void indri::parse::TextTokenizerPIA::writeToken( char* token, int piatoken_len, |
|
806 + int extent_begin, int extent_end ) { |
|
807 + |
|
808 + |
|
809 + // The TermExtent for a token will include trailing punctuation. |
|
810 + // The purpose for this is that it makes for a nicer display when a |
|
811 + // sequence of tokens (say, a sentence) is retrieved and shown to |
|
812 + // the user. |
|
813 + |
|
814 + TermExtent extent; |
|
815 + extent.begin = extent_begin; |
|
816 + extent.end = extent_end; |
|
817 + _document.positions.push_back( extent ); |
|
818 + |
|
819 + // The terms entry for a token won't include the punctuation. |
|
820 + |
|
821 + char* write_loc = _termBuffer.write( piatoken_len + 1 ); |
|
822 + strncpy( write_loc, token, piatoken_len ); |
|
823 + write_loc[piatoken_len] = '\0'; |
|
824 + _document.terms.push_back( write_loc ); |
|
825 +} |
|
826 + |
|
827 + |
|
828 --- indri-5.4/include/indri/TextTokenizerPIA.hpp po črc 15 14:38:50 2013 |
|
829 +++ indri-5.4/include/indri/TextTokenizerPIA.hpp po črc 15 14:36:54 2013 |
|
830 @@ -0,0 +1,73 @@ |
|
831 +/*========================================================================== |
|
832 + * Copyright (c) 2003-2005 University of Massachusetts. All Rights Reserved. |
|
833 + * |
|
834 + * Use of the Lemur Toolkit for Language Modeling and Information Retrieval |
|
835 + * is subject to the terms of the software license set forth in the LICENSE |
|
836 + * file included with this software, and also available at |
|
837 + * http://www.lemurproject.org/license.html |
|
838 + * |
|
839 + *========================================================================== |
|
840 + */ |
|
841 + |
|
842 +// |
|
843 +// TextTokenizerPIA |
|
844 +// |
|
845 +// 15 September 2005 -- mwb |
|
846 +// |
|
847 + |
|
848 +#ifndef INDRI_TEXTTOKENIZERPIA_HPP |
|
849 +#define INDRI_TEXTTOKENIZERPIA_HPP |
|
850 + |
|
851 +#include <stdio.h> |
|
852 +#include <string> |
|
853 +#include <map> |
|
854 + |
|
855 +#include "indri/IndriTokenizer.hpp" |
|
856 +#include "indri/Buffer.hpp" |
|
857 +#include "indri/TagEvent.hpp" |
|
858 +#include "indri/UnparsedDocument.hpp" |
|
859 +#include "indri/TokenizedDocument.hpp" |
|
860 +#include "indri/UTF8Transcoder.hpp" |
|
861 + |
|
862 +namespace indri { |
|
863 + namespace parse { |
|
864 + |
|
865 + class TextTokenizerPIA : public Tokenizer { |
|
866 + |
|
867 + public: |
|
868 + TextTokenizerPIA( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) { |
|
869 + |
|
870 + _tokenize_markup = tokenize_markup; |
|
871 + _tokenize_entire_words = tokenize_entire_words; |
|
872 + } |
|
873 + |
|
874 + ~TextTokenizerPIA() {} |
|
875 + |
|
876 + TokenizedDocument* tokenize( UnparsedDocument* document ); |
|
877 + |
|
878 + void handle( UnparsedDocument* document ); |
|
879 + void setHandler( ObjectHandler<TokenizedDocument>& h ); |
|
880 + |
|
881 + protected: |
|
882 + void processASCIIToken(); |
|
883 + void processUTF8Token(); |
|
884 + void processTag(); |
|
885 + |
|
886 + indri::utility::Buffer _termBuffer; |
|
887 + UTF8Transcoder _transcoder; |
|
888 + |
|
889 + bool _tokenize_markup; |
|
890 + bool _tokenize_entire_words; |
|
891 + |
|
892 + private: |
|
893 + ObjectHandler<TokenizedDocument>* _handler; |
|
894 + TokenizedDocument _document; |
|
895 + |
|
896 + void writeToken( char* token, int token_len, int extent_begin, |
|
897 + int extent_end ); |
|
898 + }; |
|
899 + } |
|
900 +} |
|
901 + |
|
902 +#endif // INDRI_TEXTTOKENIZERPIA_HPP |
|
903 + |
|
904 --- indri-5.4/src/TokenizerFactory.cpp po črc 15 14:39:30 2013 |
|
905 +++ indri-5.4/src/TokenizerFactory.cpp po črc 15 14:29:11 2013 |
|
906 @@ -22,6 +22,7 @@ |
|
907 |
|
908 #include "indri/TextTokenizer.hpp" |
|
909 // Add an #include for your Tokenizer here. |
|
910 +#include "indri/TextTokenizerPIA.hpp" |
|
911 |
|
912 |
|
913 #define TOKENIZER_WORD ("Word") |
|
914 @@ -29,6 +30,8 @@ |
|
915 #define TOKENIZER_CHAR ("Char") |
|
916 #define TOKENIZER_CHAR_NO_MARKUP ("Char without Markup") |
|
917 // Add a #define for your Tokenizer here. |
|
918 +#define TOKENIZER_PIA ("PIA") |
|
919 +#define TOKENIZER_PIA_NO_MARKUP ("PIA without Markup") |
|
920 |
|
921 |
|
922 // |
|
923 @@ -78,8 +81,23 @@ |
|
924 // got "char" |
|
925 return TOKENIZER_CHAR; |
|
926 |
|
927 + } else if ( ( name[0] == 'p' || name[0] == 'P' ) && |
|
928 + ( name[1] == 'i' || name[1] == 'I' ) && |
|
929 + ( name[2] == 'a' || name[2] == 'A' ) ) { |
|
930 + |
|
931 + if ( name[3] == '-' && |

932 + ( name[4] == 'n' || name[4] == 'N' ) && |

933 + ( name[5] == 'o' || name[5] == 'O' ) ) { |
|
934 + |
|
935 + // got "pia-nomarkup" |
|
936 + return TOKENIZER_PIA_NO_MARKUP; |
|
937 + } |
|
938 + |
|
939 + // got "pia" |
|
940 + return TOKENIZER_PIA; |
|
941 } |
|
942 |
|
943 + |
|
944 return ""; |
|
945 } |
|
946 |
|
947 @@ -105,6 +123,14 @@ |
|
948 |
|
949 tokenizer = new indri::parse::TextTokenizer( false, false ); |
|
950 |
|
951 + } else if ( preferred == TOKENIZER_PIA ) { |
|
952 + |
|
953 + tokenizer = new indri::parse::TextTokenizerPIA(); |
|
954 + |
|
955 + } else if ( preferred == TOKENIZER_PIA_NO_MARKUP ) { |
|
956 + |
|
957 + tokenizer = new indri::parse::TextTokenizerPIA( false ); |
|
958 + |
|
959 } else { |
|
960 |
|
961 LEMUR_THROW( LEMUR_RUNTIME_ERROR, name + " is not a known tokenizer." ); |
|
962 --- indri-5.4/src/FileClassEnvironmentFactory.cpp po črc 15 14:40:19 2013 |
|
963 +++ indri-5.4/src/FileClassEnvironmentFactory.cpp po črc 15 14:29:12 2013 |
|
964 @@ -189,6 +189,20 @@ |
|
965 trec_conflations // conflations |
|
966 }, |
|
967 { |
|
968 + "trecpia", // name |
|
969 + "xml", // parser |
|
970 + "pia", // tokenizer |
|
971 + "tagged", // iterator |
|
972 + "<DOC>", // startDocTag |
|
973 + "</DOC>", // endDocTag |
|
974 + NULL, // endMetadataTag |
|
975 + trec_include_tags, // includeTags |
|
976 + NULL, // excludeTags |
|
977 + trec_index_tags, // indexTags |
|
978 + trec_metadata_tags, // metadataTags |
|
979 + trec_conflations // conflations |
|
980 + }, |
|
981 + { |
|
982 "trecchar", // name |
|
983 "xml", // parser |
|
984 "char", // tokenizer |
|
985 --- indri-5.4/Makefile.app.in 2013-09-04 06:31:06.740210927 -0700 |
|
986 +++ indri-5.4/Makefile.app.in 2013-09-04 06:27:24.857989779 -0700 |
|
987 @@ -1,22 +1,26 @@ |
|
988 +include MakeDefns |
|
989 + |
|
990 ## your application name here |
|
991 -APP= |
|
992 +APP=pia_wrapper |
|
993 SRC=$(APP).cpp |
|
994 ## extra object files for your app here |
|
995 OBJ= |
|
996 +OUTPUT=lib$(APP).so.1 |
|
997 |
|
998 prefix = @prefix@ |
|
999 exec_prefix = ${prefix} |
|
1000 libdir = @libdir@ |
|
1001 includedir = @includedir@ |
|
1002 -INCPATH=-I$(includedir) |
|
1003 -LIBPATH=-L$(libdir) |
|
1004 +INCPATH=-Iinclude -Icontrib/lemur/include |
|
1005 +LIBPATH=-Lobj |
|
1006 CXXFLAGS=@DEFS@ @CPPFLAGS@ @CXXFLAGS@ $(INCPATH) |
|
1007 -CPPLDFLAGS = @LDFLAGS@ -lindri @LIBS@ |
|
1008 +CPPLDFLAGS = @LDFLAGS@ -lnvpair -lindri @LIBS@ |
|
1009 |
|
1010 all: |
|
1011 - $(CXX) $(CXXFLAGS) $(SRC) -o $(APP) $(OBJ) $(LIBPATH) $(CPPLDFLAGS) |
|
1012 + $(CXX) $(CXXFLAGS) $(SRC) -fpic -shared -static-libgcc -h $(OUTPUT) -o $(OUTPUT) $(OBJ) $(LIBPATH) $(CPPLDFLAGS) |
|
1013 |
|
1014 clean: |
|
1015 rm -f $(APP) |
|
1016 |
|
1017 - |
|
1018 +install: |
|
1019 + cp $(OUTPUT) $(libdir) |
|
1020 --- indri-5.4/Makefile 2013-09-12 07:39:16.027125829 -0700 |
|
1021 +++ indri-5.4/Makefile 2013-09-12 07:38:44.720450641 -0700 |
|
1022 @@ -73,5 +73,6 @@ |
|
1023 $(MAKE) install -C doc |
|
1024 $(MAKE) -C site-search install |
|
1025 $(INSTALL_DATA) Makefile.app $(pkgdatadir) |
|
1026 + $(MAKE) -f Makefile.app install |
|
1027 |
|
1028 test: |