PSARC/2015/412 Indri removal
author Vladimir Marek <Vladimir.Marek@oracle.com>
Mon, 07 Dec 2015 13:52:39 +0100
changeset 5215 7fe3e5b43e6a
parent 5214 f6336578f3e5
child 5216 fdd262355907
PSARC/2015/412 Indri removal 22321391 Remove indri
components/indri/Makefile
components/indri/indri.p5m
components/indri/patches/64bit.patch
components/indri/patches/bigendian.patch
components/indri/patches/build_also_shared.patch
components/indri/patches/pia.patch
components/indri/patches/remove_xpdf.patch
components/meta-packages/history/history
--- a/components/indri/Makefile	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
-#
-
-COMPILER			 = gcc
-
-include ../../make-rules/shared-macros.mk
-
-COMPONENT_NAME		= indri
-COMPONENT_VERSION	= 5.4
-COMPONENT_PROJECT_URL	= http://lemurproject.org/indri.php
-COMPONENT_SRC		= $(COMPONENT_NAME)-$(COMPONENT_VERSION)
-COMPONENT_ARCHIVE 	= $(COMPONENT_SRC).tar.gz
-COMPONENT_ARCHIVE_HASH	= \
-	sha256:b1d27f6da4cb15776cee0121c9511ed0e998d47564d785a2bb41a44c654e3e3f
-COMPONENT_ARCHIVE_URL	= http://sourceforge.net/projects/lemur/files/lemur/$(COMPONENT_SRC)/$(COMPONENT_SRC).tar.gz/download
-COMPONENT_BUGDB		= library/libindri
-
-TPNO=			13668
-
-include $(WS_MAKE_RULES)/prep.mk
-include $(WS_MAKE_RULES)/configure.mk
-include $(WS_MAKE_RULES)/ips.mk
-
-COMPONENT_PRE_CONFIGURE_ACTION	 = $(CLONEY) $(SOURCE_DIR) $(@D)
-CONFIGURE_OPTIONS		+= "CFLAGS=$(CFLAGS)"
-CONFIGURE_OPTIONS		+= "CXXFLAGS=$(CXXFLAGS)"
-CONFIGURE_OPTIONS		+= "LDFLAGS=$(LDFLAGS)"
-COMPONENT_INSTALL_ARGS		 = "includedir=$(PROTOUSRINCDIR)"
-COMPONENT_INSTALL_ARGS		+= "datarootdir=$(PROTOUSRSHAREDIR)"
-
-$(BUILD_DIR_32)/.installed: COMPONENT_INSTALL_ARGS += "bindir=$(PROTOUSRBINDIR)"
-$(BUILD_DIR_32)/.installed: COMPONENT_INSTALL_ARGS += "libdir=$(PROTOUSRLIBDIR)"
-$(BUILD_DIR_64)/.installed: COMPONENT_INSTALL_ARGS += "bindir=$(PROTOUSRBINDIR64)"
-$(BUILD_DIR_64)/.installed: COMPONENT_INSTALL_ARGS += "libdir=$(PROTOUSRLIBDIR64)"
-# Because of 18041236
-$(BUILD_DIR_64)/.built:     LD_B_DIRECT =
-
-COMPONENT_POST_BUILD_ACTION = \
-	(cd $(@D) ; $(ENV) $(COMPONENT_BUILD_ENV) \
-		$(GMAKE) -f Makefile.app)
-
-GPATCH_FLAGS += -E # remove empty files
-
-ASLR_MODE = $(ASLR_ENABLE)
-
-# common targets
-configure:	$(CONFIGURE_32_and_64)
-
-build:          $(BUILD_32_and_64)
-
-install:        $(INSTALL_32_and_64)
-
-test:           $(NO_TESTS)
-
-system-test:    $(NO_TESTS)
-
-
-REQUIRED_PACKAGES += developer/lexer/flex
-REQUIRED_PACKAGES += developer/parser/bison
-REQUIRED_PACKAGES += library/zlib
-REQUIRED_PACKAGES += system/library
-REQUIRED_PACKAGES += system/library/gcc/gcc-c-runtime
-REQUIRED_PACKAGES += system/library/gcc/gcc-c++-runtime
-REQUIRED_PACKAGES += system/library/math
--- a/components/indri/indri.p5m	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,324 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-# Copyright (c) 2014, 2015, Oracle and/or its affiliates. All rights reserved.
-#
-
-set name=pkg.fmri \
-    value=pkg:/library/indri@$(IPS_COMPONENT_VERSION),$(BUILD_VERSION)
-set name=pkg.summary value="Text search engine"
-set name=pkg.description \
-    value="Indri is a search engine that provides state-of-the-art text search and a rich structured query language for text collections of up to 50 million documents (single machine) or 500 million documents (distributed search). Available for Linux, Solaris, Windows and Mac OSX."
-set name=com.oracle.info.description value="the Indri search engine"
-set name=com.oracle.info.tpno value=$(TPNO)
-set name=info.classification \
-    value=org.opensolaris.category.2008:System/Libraries
-set name=info.source-url value=$(COMPONENT_ARCHIVE_URL)
-set name=info.upstream-url value=$(COMPONENT_PROJECT_URL)
-set name=org.opensolaris.arc-caseid value=PSARC/2013/232
-set name=org.opensolaris.consolidation value=$(CONSOLIDATION)
-
-# Clashes with userland antlr
-# dir  path=usr/include/antlr
-# ...
-
-# usr/include
-dir  path=usr/include/indri
-file path=usr/include/indri/AnchorTextAnnotator.hpp
-file path=usr/include/indri/AnchorTextHarvester.hpp
-file path=usr/include/indri/AnchorTextWriter.hpp
-file path=usr/include/indri/Annotator.hpp
-file path=usr/include/indri/Appliers.hpp
-file path=usr/include/indri/ArabicStemmerTransformation.hpp
-file path=usr/include/indri/Arabic_Stemmer_utf8.hpp
-file path=usr/include/indri/AttributeValuePair.hpp
-file path=usr/include/indri/BeliefNode.hpp
-file path=usr/include/indri/BooleanAndNode.hpp
-file path=usr/include/indri/Buffer.hpp
-file path=usr/include/indri/BulkTree.hpp
-file path=usr/include/indri/CachedFrequencyBeliefNode.hpp
-file path=usr/include/indri/Collection.hpp
-file path=usr/include/indri/CombinedVocabularyIterator.hpp
-file path=usr/include/indri/Combiner.hpp
-file path=usr/include/indri/CompressedCollection.hpp
-file path=usr/include/indri/ConditionVariable.hpp
-file path=usr/include/indri/Conflater.hpp
-file path=usr/include/indri/ConflationPattern.hpp
-file path=usr/include/indri/ContextCountAccumulator.hpp
-file path=usr/include/indri/ContextCountGraphCopier.hpp
-file path=usr/include/indri/ContextCountGraphExtractor.hpp
-file path=usr/include/indri/ContextInclusionAndNode.hpp
-file path=usr/include/indri/ContextSimpleCountAccumulator.hpp
-file path=usr/include/indri/ContextSimpleCountCollectorCopier.hpp
-file path=usr/include/indri/Copier.hpp
-file path=usr/include/indri/CorpusStatistics.hpp
-file path=usr/include/indri/DagCopier.hpp
-file path=usr/include/indri/DateFieldAnnotator.hpp
-file path=usr/include/indri/DateParse.hpp
-file path=usr/include/indri/DeletedDocumentList.hpp
-file path=usr/include/indri/DirectoryIterator.hpp
-file path=usr/include/indri/DirichletTermScoreFunction.hpp
-file path=usr/include/indri/DiskDocExtentListIterator.hpp
-file path=usr/include/indri/DiskDocListFileIterator.hpp
-file path=usr/include/indri/DiskDocListIterator.hpp
-file path=usr/include/indri/DiskDocumentDataIterator.hpp
-file path=usr/include/indri/DiskFrequentVocabularyIterator.hpp
-file path=usr/include/indri/DiskIndex.hpp
-file path=usr/include/indri/DiskKeyfileVocabularyIterator.hpp
-file path=usr/include/indri/DiskTermData.hpp
-file path=usr/include/indri/DiskTermListFileIterator.hpp
-file path=usr/include/indri/DocExtentListIterator.hpp
-file path=usr/include/indri/DocExtentListMemoryBuilder.hpp
-file path=usr/include/indri/DocListFileIterator.hpp
-file path=usr/include/indri/DocListIterator.hpp
-file path=usr/include/indri/DocListIteratorNode.hpp
-file path=usr/include/indri/DocListMemoryBuilder.hpp
-file path=usr/include/indri/DocumentCount.hpp
-file path=usr/include/indri/DocumentData.hpp
-file path=usr/include/indri/DocumentDataIterator.hpp
-file path=usr/include/indri/DocumentIterator.hpp
-file path=usr/include/indri/DocumentIteratorFactory.hpp
-file path=usr/include/indri/DocumentStructure.hpp
-file path=usr/include/indri/DocumentStructureHolderNode.hpp
-file path=usr/include/indri/DocumentVector.hpp
-file path=usr/include/indri/EvaluatorNode.hpp
-file path=usr/include/indri/Extent.hpp
-file path=usr/include/indri/ExtentAndNode.hpp
-file path=usr/include/indri/ExtentChildNode.hpp
-file path=usr/include/indri/ExtentDescendantNode.hpp
-file path=usr/include/indri/ExtentEnforcementNode.hpp
-file path=usr/include/indri/ExtentInsideNode.hpp
-file path=usr/include/indri/ExtentOrNode.hpp
-file path=usr/include/indri/ExtentParentNode.hpp
-file path=usr/include/indri/ExtentRestrictionModelAnnotatorCopier.hpp
-file path=usr/include/indri/ExtentRestrictionNode.hpp
-file path=usr/include/indri/FieldBelowWalker.hpp
-file path=usr/include/indri/FieldBetweenNode.hpp
-file path=usr/include/indri/FieldEqualsNode.hpp
-file path=usr/include/indri/FieldExtent.hpp
-file path=usr/include/indri/FieldGreaterNode.hpp
-file path=usr/include/indri/FieldIteratorNode.hpp
-file path=usr/include/indri/FieldLessNode.hpp
-file path=usr/include/indri/FieldListIterator.hpp
-file path=usr/include/indri/FieldStatistics.hpp
-file path=usr/include/indri/FieldWildcardNode.hpp
-file path=usr/include/indri/File.hpp
-file path=usr/include/indri/FileClassEnvironment.hpp
-file path=usr/include/indri/FileClassEnvironmentFactory.hpp
-file path=usr/include/indri/FileTreeIterator.hpp
-file path=usr/include/indri/FilterNode.hpp
-file path=usr/include/indri/FilterRejectNode.hpp
-file path=usr/include/indri/FilterRequireNode.hpp
-file path=usr/include/indri/FixedPassageNode.hpp
-file path=usr/include/indri/FrequencyListCopier.hpp
-file path=usr/include/indri/HTMLParser.hpp
-file path=usr/include/indri/HashTable.hpp
-file path=usr/include/indri/Index.hpp
-file path=usr/include/indri/IndexEnvironment.hpp
-file path=usr/include/indri/IndexWriter.hpp
-file path=usr/include/indri/IndriParser.hpp
-file path=usr/include/indri/IndriTimer.hpp
-file path=usr/include/indri/IndriTokenizer.hpp
-file path=usr/include/indri/InferenceNetwork.hpp
-file path=usr/include/indri/InferenceNetworkBuilder.hpp
-file path=usr/include/indri/InferenceNetworkNode.hpp
-file path=usr/include/indri/InternalFileBuffer.hpp
-file path=usr/include/indri/JelinekMercerTermScoreFunction.hpp
-file path=usr/include/indri/KrovetzStemmer.hpp
-file path=usr/include/indri/KrovetzStemmerTransformation.hpp
-file path=usr/include/indri/LengthPriorNode.hpp
-file path=usr/include/indri/ListAccumulator.hpp
-file path=usr/include/indri/ListBeliefNode.hpp
-file path=usr/include/indri/ListCache.hpp
-file path=usr/include/indri/ListIteratorNode.hpp
-file path=usr/include/indri/LocalQueryServer.hpp
-file path=usr/include/indri/Lockable.hpp
-file path=usr/include/indri/MaxNode.hpp
-file path=usr/include/indri/MboxDocumentIterator.hpp
-file path=usr/include/indri/MemoryDocumentDataIterator.hpp
-file path=usr/include/indri/MemoryIndex.hpp
-file path=usr/include/indri/MemoryIndexDocListFileIterator.hpp
-file path=usr/include/indri/MemoryIndexTermListFileIterator.hpp
-file path=usr/include/indri/MemoryIndexVocabularyIterator.hpp
-file path=usr/include/indri/MetadataPair.hpp
-file path=usr/include/indri/Mutex.hpp
-file path=usr/include/indri/NestedExtentInsideNode.hpp
-file path=usr/include/indri/NestedListBeliefNode.hpp
-file path=usr/include/indri/NetworkListener.hpp
-file path=usr/include/indri/NetworkMessageStream.hpp
-file path=usr/include/indri/NetworkServerProxy.hpp
-file path=usr/include/indri/NetworkServerStub.hpp
-file path=usr/include/indri/NetworkStream.hpp
-file path=usr/include/indri/NexiLexer.hpp
-file path=usr/include/indri/NexiLexerTokenTypes.hpp
-file path=usr/include/indri/NexiParser.hpp
-file path=usr/include/indri/NormalDistribution.hpp
-file path=usr/include/indri/NormalizationTransformation.hpp
-file path=usr/include/indri/NotNode.hpp
-file path=usr/include/indri/NullListNode.hpp
-file path=usr/include/indri/NullScorerNode.hpp
-file path=usr/include/indri/NumericFieldAnnotator.hpp
-file path=usr/include/indri/ObjectHandler.hpp
-file path=usr/include/indri/OfficeHelper.hpp
-file path=usr/include/indri/OffsetAnnotationAnnotator.hpp
-file path=usr/include/indri/OffsetMetadataAnnotator.hpp
-file path=usr/include/indri/OrNode.hpp
-file path=usr/include/indri/OrderedWindowNode.hpp
-file path=usr/include/indri/Packer.hpp
-file path=usr/include/indri/PageRank.hpp
-file path=usr/include/indri/Parameters.hpp
-file path=usr/include/indri/ParsedDocument.hpp
-file path=usr/include/indri/ParserFactory.hpp
-file path=usr/include/indri/Path.hpp
-file path=usr/include/indri/PlusNode.hpp
-file path=usr/include/indri/PonteExpander.hpp
-file path=usr/include/indri/PorterStemmerTransformation.hpp
-file path=usr/include/indri/Porter_Stemmer.hpp
-file path=usr/include/indri/PowerPointDocumentExtractor.hpp
-file path=usr/include/indri/PriorFactory.hpp
-file path=usr/include/indri/PriorListIterator.hpp
-file path=usr/include/indri/PriorNode.hpp
-file path=usr/include/indri/QueryAnnotation.hpp
-file path=usr/include/indri/QueryEnvironment.hpp
-file path=usr/include/indri/QueryExpander.hpp
-file path=usr/include/indri/QueryLexer.hpp
-file path=usr/include/indri/QueryLexerTokenTypes.hpp
-file path=usr/include/indri/QueryParser.hpp
-file path=usr/include/indri/QueryParserFactory.hpp
-file path=usr/include/indri/QueryResponsePacker.hpp
-file path=usr/include/indri/QueryResponseUnpacker.hpp
-file path=usr/include/indri/QueryServer.hpp
-file path=usr/include/indri/QuerySpec.hpp
-file path=usr/include/indri/QueryStopper.hpp
-file path=usr/include/indri/QueryTFWalker.hpp
-file path=usr/include/indri/RMExpander.hpp
-file path=usr/include/indri/RVLCompressStream.hpp
-file path=usr/include/indri/RVLDecompressStream.hpp
-file path=usr/include/indri/RawScorerNodeExtractor.hpp
-file path=usr/include/indri/RawTextParser.hpp
-file path=usr/include/indri/ReaderLockable.hpp
-file path=usr/include/indri/ReadersWritersLock.hpp
-file path=usr/include/indri/ReformulateQuery.hpp
-file path=usr/include/indri/RegionAllocator.hpp
-file path=usr/include/indri/RelevanceModel.hpp
-file path=usr/include/indri/Repository.hpp
-file path=usr/include/indri/RepositoryLoadThread.hpp
-file path=usr/include/indri/RepositoryMaintenanceThread.hpp
-file path=usr/include/indri/ScopedLock.hpp
-file path=usr/include/indri/ScopedMonitor.hpp
-file path=usr/include/indri/ScoredExtentAccumulator.hpp
-file path=usr/include/indri/ScoredExtentResult.hpp
-file path=usr/include/indri/SequentialReadBuffer.hpp
-file path=usr/include/indri/SequentialWriteBuffer.hpp
-file path=usr/include/indri/ShrinkageBeliefNode.hpp
-file path=usr/include/indri/SimpleCopier.hpp
-file path=usr/include/indri/SkippingCapableNode.hpp
-file path=usr/include/indri/SmoothingAnnotatorWalker.hpp
-file path=usr/include/indri/SnippetBuilder.hpp
-file path=usr/include/indri/StemmerFactory.hpp
-file path=usr/include/indri/StopStructureRemover.hpp
-file path=usr/include/indri/StopperTransformation.hpp
-file path=usr/include/indri/SumNode.hpp
-file path=usr/include/indri/TFIDFExpander.hpp
-file path=usr/include/indri/TFIDFTermScoreFunction.hpp
-file path=usr/include/indri/Tag.hpp
-file path=usr/include/indri/TagDocumentIterator.hpp
-file path=usr/include/indri/TagEvent.hpp
-file path=usr/include/indri/TagExtent.hpp
-file path=usr/include/indri/TagList.hpp
-file path=usr/include/indri/TaggedDocumentIterator.hpp
-file path=usr/include/indri/TaggedTextParser.hpp
-file path=usr/include/indri/TermBitmap.hpp
-file path=usr/include/indri/TermData.hpp
-file path=usr/include/indri/TermExtent.hpp
-file path=usr/include/indri/TermFieldStatistics.hpp
-file path=usr/include/indri/TermFrequencyBeliefNode.hpp
-file path=usr/include/indri/TermList.hpp
-file path=usr/include/indri/TermListFileIterator.hpp
-file path=usr/include/indri/TermRecorder.hpp
-file path=usr/include/indri/TermScoreFunction.hpp
-file path=usr/include/indri/TermScoreFunctionFactory.hpp
-file path=usr/include/indri/TermTranslator.hpp
-file path=usr/include/indri/TextDocumentExtractor.hpp
-file path=usr/include/indri/TextParser.hpp
-file path=usr/include/indri/TextTokenizer.hpp
-file path=usr/include/indri/TextTokenizerPIA.hpp
-file path=usr/include/indri/Thread.hpp
-file path=usr/include/indri/TokenizedDocument.hpp
-file path=usr/include/indri/TokenizerFactory.hpp
-file path=usr/include/indri/Transformation.hpp
-file path=usr/include/indri/TreePrinterWalker.hpp
-file path=usr/include/indri/TwoStageTermScoreFunction.hpp
-file path=usr/include/indri/URLTextAnnotator.hpp
-file path=usr/include/indri/UTF8CaseNormalizationTransformation.hpp
-file path=usr/include/indri/UTF8Transcoder.hpp
-file path=usr/include/indri/UnnecessaryNodeRemoverCopier.hpp
-file path=usr/include/indri/UnorderedWindowNode.hpp
-file path=usr/include/indri/Unpacker.hpp
-file path=usr/include/indri/UnparsedDocument.hpp
-file path=usr/include/indri/UtilityThread.hpp
-file path=usr/include/indri/VocabularyIterator.hpp
-file path=usr/include/indri/WARCDocumentIterator.hpp
-file path=usr/include/indri/WPlusNode.hpp
-file path=usr/include/indri/Walker.hpp
-file path=usr/include/indri/WeightFoldingCopier.hpp
-file path=usr/include/indri/WeightedAndNode.hpp
-file path=usr/include/indri/WeightedExtentOrNode.hpp
-file path=usr/include/indri/WeightedSumNode.hpp
-file path=usr/include/indri/WordDocumentExtractor.hpp
-file path=usr/include/indri/WriterLockable.hpp
-file path=usr/include/indri/XMLNode.hpp
-file path=usr/include/indri/XMLReader.hpp
-file path=usr/include/indri/XMLWriter.hpp
-file path=usr/include/indri/atomic.hpp
-file path=usr/include/indri/count_iterator
-file path=usr/include/indri/delete_range.hpp
-file path=usr/include/indri/greedy_vector
-file path=usr/include/indri/indri-platform.h
-file path=usr/include/indri/ref_ptr.hpp
-file path=usr/include/indri/uint64comp.hpp
-
-# usr/lib
-link path=usr/lib/$(MACH64)/libindri.so target=libindri.so.1
-file usr/lib/$(MACH64)/libindri.so.1 path=usr/lib/$(MACH64)/libindri.so.1
-link path=usr/lib/$(MACH64)/libpia_wrapper.so target=libpia_wrapper.so.1
-file usr/lib/$(MACH64)/libpia_wrapper.so.1 \
-    path=usr/lib/$(MACH64)/libpia_wrapper.so.1
-
-# usr/lib/indri directory does not get applied usr/bin defaults, so we have to
-# name them here
-file usr/bin/$(MACH64)/IndriBuildIndex path=usr/lib/indri/IndriBuildIndex \
-    owner=root group=bin mode=0555
-file usr/bin/$(MACH64)/IndriRunQuery path=usr/lib/indri/IndriRunQuery \
-    owner=root group=bin mode=0555
-link path=usr/lib/libindri.so target=libindri.so.1
-file usr/lib/libindri.so.1 path=usr/lib/libindri.so.1
-link path=usr/lib/libpia_wrapper.so target=libpia_wrapper.so.1
-file usr/lib/libpia_wrapper.so.1 path=usr/lib/libpia_wrapper.so.1
-
-# LICENSE.txt is taken from indri sources
-license LICENSE.txt license="Indri license"
-
-# not relevant for our usage as it is obsoleted
-# http://lemurproject.org/lemur.php
-# dir  path=usr/include/lemur
-
-# not relevant for our usage
-# dir  path=usr/share/indri
--- a/components/indri/patches/64bit.patch	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,13 +0,0 @@
-Add support for 64bit build
-
---- indri-5.4/site-search/cgi/Makefile	2013-09-17 05:38:33.511459071 -0700
-+++ indri-5.4/site-search/cgi/Makefile	2013-09-17 05:37:32.444587645 -0700
-@@ -27,7 +27,7 @@
- all: $(PROG)
- 
- $(PROG): $(OBJS) $(LIBDEPS)
-	$(CXX)  -o $@ $(OBJS) $(LDFLAGS)
-+	$(CXX) $(CXXFLAGS) -o $@ $(OBJS) $(LDFLAGS)
- 
- clean:
- 	rm -f $(PROG) $(OBJS)
--- a/components/indri/patches/bigendian.patch	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-Indri tries to define htonll and ntohll function which clash with solaris definitions
---- indri-5.4/contrib/lemur/include/lemur/lemur-compat.hpp	2013-10-08 00:25:53.513213629 +0200
-+++ indri-5.4/contrib/lemur/include/lemur/lemur-compat.hpp	2013-10-08 00:25:34.165330285 +0200
-@@ -212,6 +212,8 @@
- #endif 
- 
- #if defined(WORDS_BIGENDIAN)
-+#undef htonll
-+#undef ntohll
- inline UINT64 htonll( UINT64 native ) {
-   return native;
- }
--- a/components/indri/patches/build_also_shared.patch	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,19 +0,0 @@
-Add support for building shared libraries
---- indri-5.4/src/Makefile	2013-09-04 06:16:24.212280233 -0700
-+++ indri-5.4/src/Makefile	2013-09-04 06:15:35.997458620 -0700
-@@ -17,6 +17,7 @@
- # how to make a library from object files
- $(OBJLIB): $(SPECHEADERS) $(OBJ) $(SPECOBJS)
- 	rm -f $@; $(AR) -rs $@ $(OBJ)
-+	$(CC) $(CFLAGS) -shared -o $(OBJLIB:.a=.so).1 -h $(OBJLIB:.a=.so).1 $(OBJ) -Wl,-z -Wl,allextract ../contrib/lemur/obj/liblemur.a -Wl,-z -Wl,allextract ../contrib/antlr/obj/libantlr.a
- 
- #check this.
- $(SPECHEADERS): $(QUERYSPEC)
-@@ -40,6 +41,7 @@
- install:
- 	$(INSTALL_DATA) $(ALLHEADER) $(pkgincludedir)
- 	$(AR) -rs $(libdir)/$(INDRILIB) $(OBJ)
-+	cp $(OBJLIB:.a=.so).1 $(libdir)
- 
- stamp:
- 	awk -f ../src/version-stamper "stamp=`date`" ../include/indri/indri-platform.h > ../include/indri/indri-platform.h2
--- a/components/indri/patches/pia.patch	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,1028 +0,0 @@
-Add our PIA wrapper to indri sources. This patch does several things:
- - Add pia wrapper sources to indri source tree
- - Add new tokenizer which does not treat '_' as a separator
-   - The TextTokenizerPIA.l differs from TextTokenizer.l only in single character
-      -[a-zA-Z0-9']+  { byte_position += tokleng; return ASCII_TOKEN; }
-      +[a-zA-Z0-9_']+ { byte_position += tokleng; return ASCII_TOKEN; }
-   - plus many symbol renames so that the parsers can coexist (toktext -> piatoktext etc.)
-   - TextTokenizerPIA.hpp contains only symbol renamse
- - Rest are modifications to make indri build PIA wrapper
-
-
---- indri-5.4/pia_wrapper.cpp	po črc 15 14:30:41 2013
-+++ indri-5.4/pia_wrapper.cpp	po črc 15 14:29:09 2013
-@@ -0,0 +1,222 @@
-+/*
-+ * TO compile :
-+ *      g++ -o libpia_wrapper.so -shared -fPIC -I../vlad-libs/sparc/usr/include/ -L../vlad-libs/sparc/usr/lib/ -lclucene-core -lnvpair pia_wrapper.cc
-+ *
-+ */
-+
-+#include <sys/stat.h>
-+#include <strings.h>
-+#include <stdio.h>
-+#include <libnvpair.h>
-+
-+#include <iostream>
-+#include <string>
-+#include <sstream>
-+#include <fstream>
-+
-+#include <vector>
-+#include "indri/QueryEnvironment.hpp"
-+#include "indri/SnippetBuilder.hpp"
-+#include "indri/Repository.hpp"
-+
-+using namespace std;
-+
-+using namespace indri::api;
-+
-+#define MAX_RESULTS 3
-+#define PIA_DATABASE "/var/db/piadb"
-+#define PIA_DATABASE_STORAGE PIA_DATABASE "/collection/storage"
-+
-+indri::collection::Repository repository;
-+
-+std::string
-+getFieldText(int documentID, std::string field) {
-+	std::string ret_val = "";
-+	indri::collection::Repository::index_state repIndexState = repository.indexes();
-+	indri::index::Index *thisIndex=(*repIndexState)[0];
-+	int fieldID=thisIndex->field(field);
-+
-+	if (fieldID < 1) {
-+		return "";
-+	}
-+
-+	const indri::index::TermList *termList=thisIndex->termList(documentID);
-+
-+	if (!termList) {
-+		return "";
-+	}
-+
-+	indri::utility::greedy_vector< indri::index::FieldExtent > fieldVec=termList->fields();
-+	indri::utility::greedy_vector< indri::index::FieldExtent >::iterator fIter=fieldVec.begin();
-+	while (fIter!=fieldVec.end()) {
-+
-+		if ((*fIter).id==fieldID) {
-+			int beginTerm=(*fIter).begin;
-+			int endTerm=(*fIter).end;
-+
-+	        	/*
-+	 	 	 * note that the text is inclusive of the beginning
-+		         * but exclusive of the ending
-+		 	 */
-+			for (int t=beginTerm; t < endTerm; t++) {
-+				int thisTermID=termList->terms()[t];
-+		       		ret_val = ret_val + thisIndex->term(thisTermID) + " ";
-+			}
-+		}
-+
-+		fIter++;
-+	}
-+
-+	delete termList;
-+	termList=NULL;
-+	return ret_val;
-+}
-+
-+/*
-+ * Returns NULL on failure
-+ * nvlist *
-+ * search(
-+ *  nvlist_t *search_params,
-+ *  char **errmsg            // Similar to pia_index()
-+ * );
-+ */
-+nvlist *
-+search (nvlist_t *search_params, char **errmsg) {
-+
-+	char *index_path = PIA_DATABASE;
-+	nvlist_t **nvl_list_result;
-+	nvlist_t *nvl_return;
-+	nvlist_t *nvl_result;
-+	nvlist_t *results = NULL;
-+
-+	if (nvlist_alloc(&results, NV_UNIQUE_NAME, 0) != 0) {
-+		*errmsg = strdup("nvlist_alloc failed\n");
-+		return NULL;
-+	}
-+
-+	try {
-+		std::string query;
-+		char *panicstack;
-+		(void) nvlist_lookup_string(search_params, "stack", &panicstack);
-+
-+		QueryEnvironment indriEnvironment;
-+		indriEnvironment.addIndex(index_path);
-+
-+		/* Create Indri query */
-+		query = "#combine (" + std::string(panicstack) + ")";
-+
-+		QueryAnnotation *QAresults=indriEnvironment.runAnnotatedQuery(query.c_str(), MAX_RESULTS);
-+
-+		std::vector<indri::api::ScoredExtentResult> resultVector=QAresults->getResults();
-+
-+		int totalNumResults=resultVector.size();
-+
-+		/* Get Parsed document of the results */
-+		std::vector<ParsedDocument*> parsedDocs=indriEnvironment.documents(resultVector);
-+
-+		int results_to_return = 0;
-+		for ( size_t i=0; i < totalNumResults && i < MAX_RESULTS; i++ ) {
-+				results_to_return++;
-+		}
-+
-+		/* Open Repository */
-+		repository.openRead(index_path);
-+
-+		nvl_list_result = (nvlist_t **) malloc(results_to_return * sizeof(nvlist_t *));
-+
-+		for ( size_t i=0; i < results_to_return; i++ ) {
-+
-+			std::string ret="";
-+
-+			int thisResultDocID=resultVector[i].document;
-+
-+			if (nvlist_alloc(&nvl_list_result[i], NV_UNIQUE_NAME, 0) != 0) {
-+				*errmsg = strdup("nvlist_alloc failed\n");
-+				return NULL;
-+			}
-+
-+			if ((ret = getFieldText(thisResultDocID, "bug")) == "") {
-+				*errmsg = strdup("Lookup of bugid failed\n");
-+				return NULL;
-+			} else if (nvlist_add_string(nvl_list_result[i], "pia-bugid", ret.c_str())) {
-+				*errmsg = strdup("nvlist_add bugid failed\n");
-+				return NULL;
-+			}
-+
-+			if ((ret = getFieldText(thisResultDocID, "stack")) == "") {
-+				*errmsg = strdup("Lookup of stack failed\n");
-+				return NULL;
-+			} else if (nvlist_add_string(nvl_list_result[i], "pia-stack", ret.c_str())) {
-+				*errmsg = strdup("nvlist_add stack failed\n");
-+				return NULL;
-+			}
-+
-+			if ((ret = getFieldText(thisResultDocID, "signature")) == "") {
-+				*errmsg = strdup("Lookup of signature failed\n");
-+				return NULL;
-+			} else if (nvlist_add_string(nvl_list_result[i], "pia-signature", ret.c_str())) {
-+				*errmsg = strdup("nvlist_add signature failed\n");
-+				return NULL;
-+			}
-+
-+			int indri_score = 1000 + (int)resultVector[i].score*1000;
-+			if (nvlist_add_int32(nvl_list_result[i], "pia-score", indri_score)) {
-+				*errmsg = strdup("nvlist_add score failed\n");
-+				return NULL;
-+			}
-+		}
-+		repository.close();
-+
-+		nvlist_add_nvlist_array(results, "results", nvl_list_result, results_to_return);
-+
-+		for (int i=0; i<results_to_return; i++) {
-+			nvlist_free(nvl_list_result[i]);
-+		}
-+
-+		return results;
-+
-+	} catch(...){
-+		nvl_list_result = (nvlist_t **) malloc(1 * sizeof(nvlist_t **));
-+
-+		if (nvlist_alloc(&nvl_result, NV_UNIQUE_NAME, 0) != 0) {
-+			*errmsg = strdup("nvlist_alloc failed\n");
-+			return NULL;
-+		}
-+
-+		if (nvlist_add_string(nvl_result, "error", "Indri Error")) {
-+			*errmsg = strdup("nvlist_add error failed\n");
-+			return NULL;
-+                }
-+
-+		nvlist_dup(nvl_result, &nvl_list_result[0], 0);
-+		nvlist_free(nvl_result);
-+		nvlist_add_nvlist_array(results, "results", nvl_list_result, 1);
-+
-+		return results;
-+        }
-+}
-+
-+extern "C" nvlist*
-+pia_search (nvlist_t *search_params, char **errmsg) {
-+
-+	return search (search_params, errmsg);
-+
-+}
-+
-+int
-+init () {
-+
-+	struct stat sb;
-+	if (stat(PIA_DATABASE_STORAGE, &sb) != 0) {
-+		return 1;
-+	}
-+
-+	return 0;
-+}
-+
-+extern "C" int
-+pia_init () {
-+
-+	return init ();
-+
-+}
---- indri-5.4/src/TextTokenizerPIA.l	po črc 15 14:38:12 2013
-+++ indri-5.4/src/TextTokenizerPIA.l	po črc 15 14:36:55 2013
-@@ -0,0 +1,588 @@
-+%option noyywrap
-+%option never-interactive
-+%option prefix="piatok"
-+
-+%{
-+
-+/*==========================================================================
-+ * Copyright (c) 2004 University of Massachusetts.  All Rights Reserved.
-+ *
-+ * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-+ * is subject to the terms of the software license set forth in the LICENSE
-+ * file included with this software, and also available at
-+ * http://www.lemurproject.org/license.html
-+ *
-+ *==========================================================================
-+ */
-+
-+//
-+// TextTokenizerPIA
-+//
-+// 15 September 2005 -- mwb
-+//
-+
-+#include <string.h>
-+#include <ctype.h>
-+#include "indri/TextTokenizerPIA.hpp"
-+#include "indri/TermExtent.hpp"
-+#include "indri/TagEvent.hpp"
-+#include "indri/TokenizedDocument.hpp"
-+#include "indri/UnparsedDocument.hpp"
-+#include "indri/UTF8Transcoder.hpp"
-+#include "indri/AttributeValuePair.hpp"
-+
-+static long byte_position;
-+
-+#define ZAP           1
-+#define TAG           2
-+#define ASCII_TOKEN   3
-+#define UTF8_TOKEN    4
-+
-+%}
-+%start COMMENT
-+%%
-+
-+"<!--" { BEGIN(COMMENT); byte_position += piatokleng; return ZAP; }
-+<COMMENT>[^-]+ { byte_position += piatokleng; return ZAP; }
-+<COMMENT>"-->" { BEGIN(INITIAL); byte_position += piatokleng; return ZAP; }
-+<COMMENT>"-" { byte_position += piatokleng; return ZAP; }
-+"<!"[^\>]*">" { byte_position += piatokleng; return ZAP; }
-+\<[a-zA-Z/][^\>]*\>                                             { byte_position += piatokleng; return TAG; }
-+[&]([a-zA-Z]+|[#]([0-9]+|[xX][a-fA-F0-9]+))[;]         { byte_position += piatokleng; return ZAP; /* symbols */ }
-+[A-Z0-9]"."([A-Z0-9]".")*                                        { byte_position += piatokleng; return ASCII_TOKEN; }
-+[a-zA-Z0-9_']+                                        { byte_position += piatokleng; return ASCII_TOKEN; }
-+"-"[0-9]+("."[0-9]+)?                                  { byte_position += piatokleng; return ASCII_TOKEN; }
-+[a-zA-Z0-9\x80-\xFD]+                               { byte_position += piatokleng; return UTF8_TOKEN; }
-+
-+[\n]                                                   { byte_position += piatokleng; return ZAP; }
-+.                                                      { byte_position += piatokleng; return ZAP; }
-+
-+%%
-+
-+indri::parse::TokenizedDocument* indri::parse::TextTokenizerPIA::tokenize( indri::parse::UnparsedDocument* document ) {
-+
-+  _termBuffer.clear();
-+  if ( _tokenize_entire_words)
-+    _termBuffer.grow( document->textLength * 4);
-+  else
-+    _termBuffer.grow( document->textLength * 8 ); // extra null per char.
-+
-+  _document.terms.clear();
-+  _document.tags.clear();
-+  _document.positions.clear();
-+
-+  _document.metadata = document->metadata;
-+  _document.text = document->text;
-+  _document.textLength = document->textLength;
-+  _document.content = document->content;
-+  _document.contentLength = document->contentLength;
-+
-+  // byte offset
-+  byte_position = document->content - document->text;
-+
-+  piatok_scan_bytes( document->content, document->contentLength );
-+
-+  // Main Tokenizer loop
-+
-+  int type;
-+
-+  while ( type = piatoklex() ) {
-+
-+    switch ( type ) {
-+
-+    case ASCII_TOKEN: processASCIIToken(); break;
-+
-+    case UTF8_TOKEN: processUTF8Token(); break;
-+
-+    case TAG: if ( _tokenize_markup ) processTag(); break;
-+
-+    default:
-+    case ZAP:
-+      break;
-+
-+    }
-+
-+  }
-+
-+  piatok_delete_buffer( YY_CURRENT_BUFFER );
-+
-+  return &_document;
-+}
-+
-+// Member functions for processing tokenization events as dispatched
-+// from the main tokenizer loop
-+
-+void indri::parse::TextTokenizerPIA::processTag() {
-+
-+  // Here, we parse the tag in a fashion that is relatively robust to
-+  // malformed markup.  toktext matches this pattern: <[^>]+>
-+
-+  if ( piatoktext[1] == '?' || piatoktext[1] == '!' ) {
-+
-+    // XML declaration like <? ... ?> and <!DOCTYPE ... >
-+    return; // ignore
-+
-+  } else if ( piatoktext[1] == '/' ) { // close tag, eg. </FOO>
-+
-+    // Downcase the tag name.
-+
-+    int len = 0;
-+
-+    for ( char *c = piatoktext + 2;
-+#ifndef WIN32
-+          isalnum( *c ) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
-+#else
-+          ((*c >= 0) && isalnum( *c )) || *c == '-' || *c == '_' || *c == ':' ; c++ ) {
-+#endif
-+
-+      *c = tolower( *c );
-+      if ( *c == ':' ) *c = '_'; /* replace colon (from namespaces) */
-+      len++;
-+    }
-+
-+    TagEvent te;
-+
-+    te.open_tag = false;
-+
-+    // We need to write len characters, plus a NULL
-+    char* write_loc = _termBuffer.write( len + 1 );
-+    strncpy( write_loc, piatoktext + 2, len );
-+    write_loc[len] = '\0';
-+    te.name = write_loc;
-+
-+    // token position of tag event w/r/t token string
-+    te.pos = _document.terms.size();
-+
-+    te.begin = byte_position - piatokleng;
-+    te.end = byte_position;
-+
-+    _document.tags.push_back( te );
-+
-+#ifndef WIN32
-+    } else if ( isalpha( piatoktext[1] ) ) {
-+#else
-+    } else if ( (piatoktext[1]  >= 0) && (isalpha( piatoktext[1] ) )) {
-+#endif
-+
-+    // Try to extract the tag name:
-+
-+    char* c = piatoktext + 1;
-+    int i = 0;
-+    int offset = 1; // current offset w/r/t byte_position - piatokleng
-+    // it starts at one because it is incremented when c is, and c starts at one.
-+    char* write_loc;
-+
-+#ifndef WIN32
-+    while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
-+#else
-+    while ( ( (c[i] >= 0) && isalnum( c[i] )) || c[i] == '-' || c[i] == '_' || c[i] == ':' ) i++;
-+#endif
-+    if ( c[i] == '>' ) {
-+
-+      // open tag with no attributes, eg. <title>
-+
-+      // Ensure tag name is downcased
-+      for ( int j = 0; j < i; j++ ) {
-+        c[j] = tolower( c[j] );
-+        if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
-+      }
-+
-+      TagEvent te;
-+
-+      te.open_tag = true;
-+
-+      // need to write i characters, plus a NULL
-+      char* write_loc = _termBuffer.write( i + 1 );
-+      strncpy( write_loc, c, i );
-+      write_loc[i] = '\0';
-+      te.name = write_loc;
-+
-+      te.pos = _document.terms.size();
-+
-+      te.begin = byte_position - piatokleng;
-+      te.end = byte_position;
-+
-+      _document.tags.push_back( te );
-+
-+#ifndef WIN32
-+    } else if ( isspace( c[i] ) ) {
-+#else
-+    } else if ( (c[i]  >= 0) && (isspace( c[i] ) )) {
-+#endif
-+
-+      // open tag with attributes, eg. <A HREF="www.foo.com/bar">
-+
-+      TagEvent te;
-+
-+      te.open_tag = true;
-+
-+      // Ensure tag name is downcased
-+      for ( int j = 0; j < i; j++ ) {
-+        c[j] = tolower( c[j] );
-+        if ( c[j] == ':' ) c[j] = '_'; /* replace colon (from namespaces) */
-+      }
-+
-+      // need to write i characters, plus a NULL
-+      char* write_loc = _termBuffer.write( i + 1 );
-+      strncpy( write_loc, c, i );
-+      write_loc[i] = '\0';
-+      te.name = write_loc;
-+      c += i;
-+      offset += i;
-+
-+#ifndef WIN32
-+    while ( isspace( *c ) ) { c++; offset++; }
-+#else
-+    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
-+#endif
-+
-+      te.pos = _document.terms.size();
-+
-+      te.begin = byte_position - piatokleng;
-+      te.end = byte_position;
-+
-+      // Now search for attributes:
-+
-+      while ( *c != '>' && *c != '\0' ) {
-+
-+        AttributeValuePair avp;
-+
-+        // Try to extract attribute name:
-+
-+        i = 0;
-+#ifndef WIN32
-+        while ( isalnum( c[i] ) || c[i] == '-' || c[i] == '_' ) i++;
-+#else
-+        while ( (c[i] >= 0) && isalnum( c[i] ) || c[i] == '-' || c[i] == '_') i++;
-+#endif
-+
-+        if ( i == 0 ) break;
-+
-+        // Ensure attribute name is downcased
-+        for ( int j = 0; j < i; j++ )
-+          c[j] = tolower( c[j] );
-+
-+        // need to write i characters, plus a NULL
-+        write_loc = _termBuffer.write( i + 1 );
-+        strncpy( write_loc, c, i );
-+        write_loc[i] = '\0';
-+        avp.attribute = write_loc;
-+        c += i;
-+        offset += i;
-+
-+        // attributes can be foo\s*=\s*"bar[">] or foo\s*=\s*bar
-+
-+		// ignore any spaces
-+#ifndef WIN32
-+    while ( isspace( *c ) ) { c++; offset++; }
-+#else
-+    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
-+#endif
-+
-+        if ( *c == '=' ) {
-+
-+          c++; // get past the '=' sign.
-+          offset++;
-+
-+#ifndef WIN32
-+    while ( isspace( *c ) ) { c++; offset++; }
-+#else
-+    while (((*c) >=0) &&  isspace( *c )) { c++; offset++; }
-+#endif
-+
-+          if ( *c == '>' ) {
-+
-+            // common malformed markup <a href=>
-+
-+            // Insert empty attribute value
-+            // need to write a single NULL
-+            write_loc = _termBuffer.write( 1 );
-+            write_loc[0] = '\0';
-+            avp.value = write_loc;
-+            avp.begin = byte_position - piatokleng + offset;
-+            avp.end = byte_position - piatokleng + offset;
-+
-+          } else {
-+
-+            bool quoted = true;
-+            char quote_char;
-+            if ( *c == '"' || *c =='\'' ) { quote_char = *c; c++; offset++; }
-+            else quoted = false;
-+
-+            // Attribute value starts here.
-+
-+            i = 0;
-+// make sure the opening and closing quote character match...
-+            if ( quoted )
-+//              while ( c[i] != '"' && c[i] != '>' && c[i] !='\'') i++;
-+              while ( c[i] != quote_char && c[i] != '>') i++;
-+            else
-+#ifndef WIN32
-+              while ( ! isspace( c[i] ) && c[i] != '>' ) i++;
-+#else
-+              while ( ((c[i] >= 0)  && ! isspace( c[i] ) ) && c[i] != '>' ) i++;
-+#endif
-+
-+            // need to write i characters, plus a NULL
-+            write_loc = _termBuffer.write( i + 1 );
-+            strncpy( write_loc, c, i );
-+            write_loc[i] = '\0';
-+            avp.value = write_loc;
-+            avp.begin = byte_position - piatokleng + offset;
-+            avp.end = byte_position - piatokleng + offset + i;
-+            c += i;
-+            offset += i;
-+
-+          }
-+        } else {
-+
-+          // Insert empty attribute value
-+          // need to write a single NULL
-+          write_loc = _termBuffer.write( 1 );
-+          write_loc[0] = '\0';
-+          avp.value = write_loc;
-+          avp.begin = byte_position - piatokleng + offset;
-+          avp.end = byte_position - piatokleng + offset;
-+        }
-+#ifndef WIN32
-+        while ( isspace( *c ) || *c == '"' ) { c++; offset++; }
-+#else
-+        while ( ((*c >= 0) && isspace( *c )) || *c == '"' ) { c++; offset++; }
-+#endif
-+
-+        te.attributes.push_back( avp );
-+      }
-+
-+      _document.tags.push_back( te );
-+
-+    }
-+
-+    // One of the cases that is ignored is this common malformed
-+    // markup <foo=bar> with no tag name.  Another is the case
-+    // of an email address <foo@bar.com>
-+
-+
-+  }
-+}
-+
-+void indri::parse::TextTokenizerPIA::processUTF8Token() {
-+
-+  // A UTF-8 token, as recognized by flex, could actually be
-+  // a mixed ASCII/UTF-8 string containing any number of
-+  // UTF-8 characters, so we re-tokenize it here.
-+
-+  indri::utility::HashTable<UINT64,const int>& unicode = _transcoder.unicode();
-+
-+  int len = strlen( piatoktext );
-+
-+  UINT64* unicode_chars = new UINT64[len + 1];
-+  int* offsets = new int[len + 1];
-+  int* lengths = new int[len + 1];
-+  _transcoder.utf8_decode( piatoktext, &unicode_chars, NULL, NULL,
-+                           &offsets, &lengths );
-+
-+  const int* p;
-+  int cls;             // Character class of current UTF-8 character
-+  // offset of current UTF-8 character w/r/t toktext stored in offsets[i]
-+  // byte length of current UTF-8 character stored in lengths[i]
-+
-+  int offset = 0;      // Position of start of current *token* (not character) w/r/t toktext
-+  int extent = 0;      // Extent for this *token* including trailing punct
-+  int piatoken_len = 0;   // Same as above, minus the trailing punctuation
-+
-+  char buf[64];
-+
-+  // If this flag is true, we have punctuation symbols at the end of a
-+  // token, so do not attach another letter to this token.
-+  bool no_letter = false;
-+
-+  // In case there are malformed characters preceding the good
-+  // characters:
-+  offset = offsets[0];
-+
-+  for ( int i = 0; unicode_chars[i] != 0; i++ ) {
-+
-+    p = unicode.find( unicode_chars[i] );
-+    cls = p ? *p : 0;
-+
-+    if ( ! _tokenize_entire_words ) { // Tokenize by character
-+
-+      if ( cls != 0 && cls != 3 && cls != 5 && cls != 9 ) {
-+
-+        writeToken( piatoktext + offsets[i], lengths[i],
-+                    byte_position - piatokleng + offsets[i],
-+                    byte_position - piatokleng + offsets[i] + lengths[i] );
-+      }
-+      continue;
-+    }
-+
-+    // If this is not the first time through this loop, we need
-+    // to check to see if any bytes in toktext were skipped
-+    // during the UTF-8 analysis:
-+
-+    if ( i != 0 && offset + piatoken_len != offsets[i] ) {
-+
-+      // Write out the token we are working on, if any:
-+
-+      if ( piatoken_len > 0 ) {
-+
-+        writeToken( piatoktext + offset, piatoken_len,
-+                    byte_position - piatokleng + offset,
-+                    byte_position - piatokleng + offset + extent );
-+      }
-+
-+      extent = 0;
-+      piatoken_len = 0;
-+      no_letter = false;
-+      offset = offsets[i];
-+    }
-+
-+    // Tokenize by word:
-+
-+    switch ( cls ) {
-+
-+    case 4: // Currency symbol: always extracted alone
-+      // Action: write the token we are working on,
-+      // and write this symbol as a separate token
-+      writeToken( piatoktext + offset, extent,
-+                  byte_position - piatokleng + offset,
-+                  byte_position - piatokleng + offset + extent );
-+
-+      offset += extent;
-+
-+      writeToken( piatoktext + offset, lengths[i],
-+                  byte_position - piatokleng + offset,
-+                  byte_position - piatokleng + offset + lengths[i] );
-+
-+      offset += lengths[i];
-+      piatoken_len = 0;
-+      extent = 0;
-+      no_letter = false;
-+      break;
-+
-+    case 1: // Apostrophe
-+    case 10: // Decimal separator
-+    case 6: // Letter
-+    case 7: // Digit
-+      // Action: add this character to the end of the token we are
-+      // working on
-+      if ( no_letter ) { // This is a token boundary
-+        writeToken( piatoktext + offset, piatoken_len,
-+                    byte_position - piatokleng + offset,
-+                    byte_position - piatokleng + offset + extent );
-+
-+        offset += extent;
-+        extent = 0;
-+        piatoken_len = 0;
-+        no_letter = false;
-+
-+      }
-+
-+      extent += lengths[i];
-+      piatoken_len += lengths[i];
-+      break;
-+
-+    case 2: // Percent
-+    case 8: // Punctuation
-+    case 12: // Thousands separator
-+    case 11: // Hyphen
-+      // Action: These characters are included in the extent of the
-+      // token we are working on.
-+      no_letter = true;
-+      extent += lengths[i];
-+      break;
-+
-+    case 0: // No character class!
-+    case 3: // Control character
-+    case 5: // Non-punctuation symbol
-+    case 9: // Whitespace
-+    default:
-+      // Action: write the token we are working on.  Do not include
-+      // this character in any future token.
-+      writeToken( piatoktext + offset, piatoken_len,
-+                  byte_position - piatokleng + offset,
-+                  byte_position - piatokleng + offset + extent );
-+
-+      offset += (extent + lengths[i]); // Include current character
-+      extent = 0;
-+      piatoken_len = 0;
-+      no_letter = false;
-+
-+      break;
-+    }
-+  }
-+
-+  // Write out last token
-+  if ( piatoken_len > 0 )
-+    writeToken( piatoktext + offset, piatoken_len,
-+                byte_position - piatokleng + offset,
-+                byte_position - piatokleng + offset + extent );
-+
-+  delete[] unicode_chars;
-+  delete[] offsets;
-+  delete[] lengths;
-+}
-+
-+void indri::parse::TextTokenizerPIA::processASCIIToken() {
-+
-+  int piatoken_len = strlen( piatoktext );
-+
-+  // token_len here is the length of the token without
-+  // any trailing punctuation.
-+
-+  for ( int i = piatoken_len - 1; i > 0; i-- ) {
-+
-+    if ( ! ispunct( piatoktext[i] ) )
-+      break;
-+    else
-+      piatoken_len--;
-+  }
-+
-+  if ( _tokenize_entire_words ) {
-+
-+    writeToken( piatoktext, piatoken_len, byte_position - piatokleng, byte_position );
-+
-+  } else {
-+
-+    for ( int i = 0; i < piatoken_len; i++ )
-+      writeToken( piatoktext + i, 1, byte_position - piatokleng + i,
-+                  byte_position - piatokleng + i + 1 );
-+  }
-+}
-+
-+
-+// ObjectHandler implementation
-+
-+void indri::parse::TextTokenizerPIA::handle( indri::parse::UnparsedDocument* document ) {
-+
-+  _handler->handle( tokenize( document ) );
-+}
-+
-+void indri::parse::TextTokenizerPIA::setHandler( ObjectHandler<indri::parse::TokenizedDocument>& h ) {
-+
-+  _handler = &h;
-+}
-+
-+void indri::parse::TextTokenizerPIA::writeToken( char* token, int piatoken_len,
-+                                              int extent_begin, int extent_end ) {
-+
-+
-+  // The TermExtent for a token will include trailing punctuation.
-+  // The purpose for this is that it makes for a nicer display when a
-+  // sequence of tokens (say, a sentence) is retrieved and shown to
-+  // the user.
-+
-+  TermExtent extent;
-+  extent.begin = extent_begin;
-+  extent.end = extent_end;
-+  _document.positions.push_back( extent );
-+
-+  // The terms entry for a token won't include the punctuation.
-+
-+  char* write_loc = _termBuffer.write( piatoken_len + 1 );
-+  strncpy( write_loc, token, piatoken_len );
-+  write_loc[piatoken_len] = '\0';
-+  _document.terms.push_back( write_loc );
-+}
-+
-+
---- indri-5.4/include/indri/TextTokenizerPIA.hpp	po črc 15 14:38:50 2013
-+++ indri-5.4/include/indri/TextTokenizerPIA.hpp	po črc 15 14:36:54 2013
-@@ -0,0 +1,73 @@
-+/*==========================================================================
-+ * Copyright (c) 2003-2005 University of Massachusetts.  All Rights Reserved.
-+ *
-+ * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-+ * is subject to the terms of the software license set forth in the LICENSE
-+ * file included with this software, and also available at
-+ * http://www.lemurproject.org/license.html
-+ *
-+ *==========================================================================
-+ */
-+
-+//
-+// TextTokenizerPIA
-+//
-+// 15 September 2005 -- mwb
-+//
-+
-+#ifndef INDRI_TEXTTOKENIZERPIA_HPP
-+#define INDRI_TEXTTOKENIZERPIA_HPP
-+
-+#include <stdio.h>
-+#include <string>
-+#include <map>
-+
-+#include "indri/IndriTokenizer.hpp"
-+#include "indri/Buffer.hpp"
-+#include "indri/TagEvent.hpp"
-+#include "indri/UnparsedDocument.hpp"
-+#include "indri/TokenizedDocument.hpp"
-+#include "indri/UTF8Transcoder.hpp"
-+
-+namespace indri {
-+  namespace parse {
-+
-+    class TextTokenizerPIA : public Tokenizer {
-+
-+    public:
-+      TextTokenizerPIA( bool tokenize_markup = true, bool tokenize_entire_words = true ) : _handler(0) {
-+
-+        _tokenize_markup = tokenize_markup;
-+        _tokenize_entire_words = tokenize_entire_words;
-+      }
-+
-+      ~TextTokenizerPIA() {}
-+
-+      TokenizedDocument* tokenize( UnparsedDocument* document );
-+
-+      void handle( UnparsedDocument* document );
-+      void setHandler( ObjectHandler<TokenizedDocument>& h );
-+
-+    protected:
-+      void processASCIIToken();
-+      void processUTF8Token();
-+      void processTag();
-+
-+      indri::utility::Buffer _termBuffer;
-+      UTF8Transcoder _transcoder;
-+
-+      bool _tokenize_markup;
-+      bool _tokenize_entire_words;
-+
-+    private:
-+      ObjectHandler<TokenizedDocument>* _handler;
-+      TokenizedDocument _document;
-+
-+      void writeToken( char* token, int token_len, int extent_begin,
-+                       int extent_end );
-+    };
-+  }
-+}
-+
-+#endif // INDRI_TEXTTOKENIZERPIA_HPP
-+
---- indri-5.4/src/TokenizerFactory.cpp	po črc 15 14:39:30 2013
-+++ indri-5.4/src/TokenizerFactory.cpp	po črc 15 14:29:11 2013
-@@ -22,6 +22,7 @@
-
- #include "indri/TextTokenizer.hpp"
- // Add an #include for your Tokenizer here.
-+#include "indri/TextTokenizerPIA.hpp"
-
-
- #define TOKENIZER_WORD ("Word")
-@@ -29,6 +30,8 @@
- #define TOKENIZER_CHAR ("Char")
- #define TOKENIZER_CHAR_NO_MARKUP ("Char without Markup")
- // Add a #define for your Tokenizer here.
-+#define TOKENIZER_PIA ("PIA")
-+#define TOKENIZER_PIA_NO_MARKUP ("PIA without Markup")
-
-
- //
-@@ -78,8 +81,23 @@
-     // got "char"
-     return TOKENIZER_CHAR;
-
-+  } else if ( ( name[0] == 'p' || name[0] == 'P' ) &&
-+       ( name[1] == 'i' || name[1] == 'I' ) &&
-+       ( name[2] == 'a' || name[3] == 'A' ) ) {
-+
-+    if ( name[4] == '-' &&
-+         ( name[5] == 'n' || name[5] == 'N' ) &&
-+         ( name[5] == 'o' || name[5] == 'O' ) ) {
-+
-+      // got "pia-nomarkup"
-+      return TOKENIZER_PIA_NO_MARKUP;
-+    }
-+
-+    // got "pia"
-+    return TOKENIZER_PIA;
-   }
-
-+
-   return "";
- }
-
-@@ -105,6 +123,14 @@
-
-     tokenizer = new indri::parse::TextTokenizer( false, false );
-
-+  } else if ( preferred == TOKENIZER_PIA ) {
-+
-+    tokenizer = new indri::parse::TextTokenizerPIA();
-+
-+  } else if ( preferred == TOKENIZER_PIA_NO_MARKUP ) {
-+
-+    tokenizer = new indri::parse::TextTokenizerPIA( false );
-+
-   } else {
-
-     LEMUR_THROW( LEMUR_RUNTIME_ERROR, name + " is not a known tokenizer." );
---- indri-5.4/src/FileClassEnvironmentFactory.cpp	po črc 15 14:40:19 2013
-+++ indri-5.4/src/FileClassEnvironmentFactory.cpp	po črc 15 14:29:12 2013
-@@ -189,6 +189,20 @@
-     trec_conflations      // conflations
-   },
-   {
-+    "trecpia",           // name
-+    "xml",                // parser
-+    "pia",               // tokenizer
-+    "tagged",             // iterator
-+    "<DOC>",              // startDocTag
-+    "</DOC>",             // endDocTag
-+    NULL,                 // endMetadataTag
-+    trec_include_tags,    // includeTags
-+    NULL,                 // excludeTags
-+    trec_index_tags,      // indexTags
-+    trec_metadata_tags,   // metadataTags
-+    trec_conflations      // conflations
-+  },
-+  {
-     "trecchar",           // name
-     "xml",                // parser
-     "char",               // tokenizer
---- indri-5.4/Makefile.app.in	2013-09-04 06:31:06.740210927 -0700
-+++ indri-5.4/Makefile.app.in	2013-09-04 06:27:24.857989779 -0700
-@@ -1,22 +1,26 @@
-+include MakeDefns
-+
- ## your application name here
--APP=
-+APP=pia_wrapper
- SRC=$(APP).cpp
- ## extra object files for your app here
- OBJ=
-+OUTPUT=lib$(APP).so.1
-
- prefix = @prefix@
- exec_prefix = ${prefix}
- libdir = @libdir@
- includedir = @includedir@
--INCPATH=-I$(includedir)
--LIBPATH=-L$(libdir)
-+INCPATH=-Iinclude -Icontrib/lemur/include
-+LIBPATH=-Lobj
- CXXFLAGS=@DEFS@ @CPPFLAGS@ @CXXFLAGS@ $(INCPATH)
--CPPLDFLAGS  = @LDFLAGS@ -lindri @LIBS@
-+CPPLDFLAGS  = @LDFLAGS@ -lnvpair -lindri @LIBS@
-
- all:
--	$(CXX) $(CXXFLAGS) $(SRC) -o $(APP) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
-+	$(CXX) $(CXXFLAGS) $(SRC) -fpic -shared -static-libgcc -h $(OUTPUT) -o $(OUTPUT) $(OBJ) $(LIBPATH) $(CPPLDFLAGS)
-
- clean:
- 	rm -f $(APP)
-
--
-+install:
-+	cp $(OUTPUT) $(libdir)
---- indri-5.4/Makefile	2013-09-12 07:39:16.027125829 -0700
-+++ indri-5.4/Makefile	2013-09-12 07:38:44.720450641 -0700
-@@ -73,5 +73,6 @@
- 	$(MAKE) install -C doc
- 	$(MAKE) -C site-search install
- 	$(INSTALL_DATA) Makefile.app $(pkgdatadir)
-+	$(MAKE) -f Makefile.app install
-
- test:
--- a/components/indri/patches/remove_xpdf.patch	Thu Dec 17 23:00:14 2015 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,358 +0,0 @@
-Remove xpdf support from the build.
---- indri-5.4/MakeDefns.in	čt črc  4 15:01:17 2013
-+++ indri-5.4/MakeDefns.in	čt črc  4 15:00:40 2013
-@@ -48,7 +48,7 @@
- PHPINCLUDE = @PHPINCLUDE@
- [email protected]@
- 
--DEPENDENCIES = lemur xpdf
-+DEPENDENCIES = lemur
- ifeq ($(NEED_ANTLR), 1)
-   DEPENDENCIES += antlr
- endif
---- indri-5.4/src/PDFDocumentExtractor.cpp	čt črc  4 15:08:46 2013
-+++ indri-5.4/src/PDFDocumentExtractor.cpp	čt črc  4 15:08:28 2013
-@@ -1,214 +1,0 @@
--/*==========================================================================
-- * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
-- *
-- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-- * is subject to the terms of the software license set forth in the LICENSE
-- * file included with this software, and also available at
-- * http://www.lemurproject.org/license.html
-- *
-- *==========================================================================
--*/
--
--
--//
--// PDFDocumentExtractor
--//
--// 25 June 2004 -- tds
--//
--
--#include "indri/PDFDocumentExtractor.hpp"
--#include "indri/Buffer.hpp"
--
--#include "GString.h"
--#include "TextOutputDev.h"
--#include "PDFDoc.h"
--
--#include "Object.h"
--#include "Stream.h"
--#include "Array.h"
--#include "Dict.h"
--#include "XRef.h"
--#include "Page.h"
--#include "CharTypes.h"
--#include "GlobalParams.h"
--#include "lemur/Exception.hpp"
--
--static void buffer_write( void* stream, char* text, int len ) {
--  indri::utility::Buffer* buffer = (indri::utility::Buffer*) stream;
--
--  if( buffer->position() ) {
--    buffer->unwrite(1);
--  }
--
--  memcpy( buffer->write(len), text, len );
--  if( text[len-1] != 0 )
--    *buffer->write(1) = 0;
--}
--
--indri::parse::PDFDocumentExtractor::PDFDocumentExtractor() {
--  globalParams = new GlobalParams(0);
--  _title="";
--  _author="";
--}
--
--indri::parse::PDFDocumentExtractor::~PDFDocumentExtractor() {
--  delete globalParams;
--  globalParams = 0;
--}
--
--
--void indri::parse::PDFDocumentExtractor::seekValue(indri::xml::XMLNode* node, std::string &metaTag) {
--  if (node == NULL) {
--    return;
--  }
--
--  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
--  for( size_t i=0; i<children.size(); i++ ) {
--    indri::xml::XMLNode* child = children[i];
--    metaTag = child->getValue();
--	if(metaTag.length()==0)
--		seekValue(child,metaTag);
--	else
--		return;
--  }
--
--}
--
--void indri::parse::PDFDocumentExtractor::appendPdfMetaData(indri::xml::XMLNode* node) {
--  indri::xml::XMLNode* current = 0;
--
--  if (node == NULL) {
--    return;
--  }
--
--  const std::vector<indri::xml::XMLNode*>& children = node->getChildren();
--
--  for( size_t i=0; i<children.size(); i++ ) {
--    indri::xml::XMLNode* child = children[i];
--    std::string name = child->getName();
--	if(name=="dccreator")
--	{
--		seekValue(child,_author);
--	}
--	if(name=="dctitle")
--	{
--		seekValue(child,_title);
--	}
--	appendPdfMetaData(child);
--
--  }
--
--
--
--}
--
--void indri::parse::PDFDocumentExtractor::open( const std::string& filename ) {
--  _documentTextBuffer.clear();
--  _documentPath = filename;
--}
--
--void indri::parse::PDFDocumentExtractor::close() {
--  _documentPath = "";
--}
--
--indri::parse::UnparsedDocument* indri::parse::PDFDocumentExtractor::nextDocument() {
--  if( !_documentPath.length() )
--    return 0;
--
--  PDFDoc* doc = 0;
--  TextOutputDev* textOut = 0;
--  GString* gfilename = new GString(_documentPath.c_str());
--  doc = new PDFDoc( gfilename );
--  // if the doc is not ok, or ok to copy, it
--  // will be a document of length 0.
--  if( doc->isOk() && doc->okToCopy() ) {
--    void* stream = &_documentTextBuffer;
--    textOut = new TextOutputDev( buffer_write, stream, gFalse, gFalse);
--    if ( textOut->isOk() ) {
--      int firstPage = 1;
--      int lastPage = doc->getNumPages();
--	  double hDPI=72.0;
--	  double vDPI=72.0;
--	  int rotate=0;
--	  GBool useMediaBox=gFalse;
--	  GBool crop=gTrue; 
--	  GBool printing=gFalse; 
--	  if(doc->readMetadata()!=NULL)
--	  {
--		  GString rawMetaData = doc->readMetadata();
--		  GString preparedMetaData="";
--
--		  //zoek <rdf:RDF  en eindig bij </rdf:RDF>!! 
--		  for(int x=0; x<rawMetaData.getLength(); x++) {
--			  if(rawMetaData.getChar(x)!='?' && rawMetaData.getChar(x)!=':') {
--				  //skip characters which the XMLReader doesn't understand
--				  preparedMetaData.append(rawMetaData.getChar(x));
--			  }
--		  }
--		  std::string metaData(preparedMetaData.getCString());
--		  int startbegin = metaData.find("<rdf");
--		  int stopend = metaData.find(">", metaData.rfind("</rdf") );
--		  metaData = metaData.substr(startbegin, (stopend-startbegin)+1 );
--	  
--
--     	  indri::xml::XMLReader reader;
--
--		  try {
--			  std::auto_ptr<indri::xml::XMLNode> result( reader.read( metaData.c_str() ) );
--			  appendPdfMetaData( result.get() );
--		  } catch( lemur::api::Exception& e ) {
--			LEMUR_RETHROW( e, "Had trouble reading PDF metadata" );
--		  } 
--		  if( _author.length()>0 || _title.length()>0 )
--		  {
--			std::string createdPdfHeader;
--			createdPdfHeader="<head>\n";
--			if(_title.length()>0) {
--				createdPdfHeader+="<title>";
--				createdPdfHeader+=_title;
--				createdPdfHeader+="</title>\n";
--			}
--			if(_author.length()>0) {
--				createdPdfHeader+="<author>";
--				createdPdfHeader+=_author;
--				createdPdfHeader+="</author>\n";
--			}
--			createdPdfHeader+="</head>\n";
--			char *metastream = _documentTextBuffer.write( createdPdfHeader.length()+1 );
--			strcpy(metastream, createdPdfHeader.c_str());
--		  }
--	  }
--      doc->displayPages(textOut, firstPage, lastPage, hDPI, vDPI, rotate, useMediaBox, crop, printing);
--    }
--  }
--  
--
--  delete textOut;
--  delete doc;
--
--  _unparsedDocument.textLength = _documentTextBuffer.position();
--  _unparsedDocument.contentLength = _unparsedDocument.textLength ? _documentTextBuffer.position() - 1 : 0 ; // no null 0 if text is empty.
--  char* docnoPoint = _documentTextBuffer.write( _documentPath.length()+1 );
--  strcpy( docnoPoint, _documentPath.c_str() );
--  _unparsedDocument.text = _documentTextBuffer.front();
--  _unparsedDocument.content = _documentTextBuffer.front();
--  _unparsedDocument.metadata.clear();
--
--  indri::parse::MetadataPair pair;
--
--  pair.key = "path";
--  pair.value = docnoPoint;
--  pair.valueLength = _documentPath.length()+1;
--  _unparsedDocument.metadata.push_back( pair );
--
--  _docnostring.assign(_documentPath.c_str() );
--  cleanDocno();
--  pair.value = _docnostring.c_str();
--  pair.valueLength = _docnostring.length()+1;
--  pair.key = "docno";
--  _unparsedDocument.metadata.push_back( pair );
-- 
--  _documentPath = "";
--
--  return &_unparsedDocument;
--}
---- indri-5.4/include/indri/PDFDocumentExtractor.hpp	čt črc  4 15:16:04 2013
-+++ indri-5.4/include/indri/PDFDocumentExtractor.hpp	čt črc  4 15:15:00 2013
-@@ -1,57 +1,0 @@
--/*==========================================================================
-- * Copyright (c) 2003-2004 University of Massachusetts.  All Rights Reserved.
-- *
-- * Use of the Lemur Toolkit for Language Modeling and Information Retrieval
-- * is subject to the terms of the software license set forth in the LICENSE
-- * file included with this software, and also available at
-- * http://www.lemurproject.org/license.html
-- *
-- *==========================================================================
-- */
--
--
--//
--// PDFDocumentExtractor
--//
--// 25 June 2004 -- tds
--//
--
--#ifndef INDRI_PDFDOCUMENTEXTRACTOR_HPP
--#define INDRI_PDFDOCUMENTEXTRACTOR_HPP
--
--#include "lemur/lemur-compat.hpp"
--#include "indri/Buffer.hpp"
--#include "indri/UnparsedDocument.hpp"
--#include "indri/DocumentIterator.hpp"
--#include "indri/XMLReader.hpp"
--#include "indri/XMLNode.hpp"
--#include "indri/XMLWriter.hpp"
--#include <string>
--namespace indri
--{
--  namespace parse
--  {
--    
--    class PDFDocumentExtractor : public DocumentIterator {
--      indri::utility::Buffer _documentTextBuffer;
--      UnparsedDocument _unparsedDocument;
--      std::string _documentPath;
--  
--    public:
--      PDFDocumentExtractor();
--      ~PDFDocumentExtractor();
--
--      void open( const std::string& filename );
--      UnparsedDocument* nextDocument();
--	  void appendPdfMetaData(indri::xml::XMLNode* node);
--	  void seekValue(indri::xml::XMLNode* node, std::string &metaTag);
--      void close();
--	private:
--	  std::string _title;
--	  std::string _author;
--
--    };
--  }
--}
--
--#endif // INDRI_PDFDOCUMENTEXTRACTOR_HPP
---- indri-5.4/src/DocumentIteratorFactory.cpp	čt črc  4 15:24:24 2013
-+++ indri-5.4/src/DocumentIteratorFactory.cpp	čt črc  4 15:23:27 2013
-@@ -18,7 +18,6 @@
- 
- #include "indri/DocumentIteratorFactory.hpp"
- 
--#include "indri/PDFDocumentExtractor.hpp"
- #include "indri/TaggedDocumentIterator.hpp"
- #include "indri/WARCDocumentIterator.hpp"
- #include "indri/TextDocumentExtractor.hpp"
-@@ -36,7 +35,6 @@
- 
- #define TYPE_TAGGED   ( "Tagged Document Collection" )
- #define TYPE_WARC     ( "WARC Document Collection" )
--#define TYPE_PDF      ( "Adobe PDF" )
- #define TYPE_WORD     ( "Microsoft Word" )
- #define TYPE_PPT      ( "Microsoft PowerPoint" )
- #define TYPE_MBOX     ( "Mailbox" )
-@@ -53,8 +51,6 @@
-     result = iter;
-   } else if( preferred == TYPE_WARC ) {
-     result = new indri::parse::WARCDocumentIterator();
--  } else if( preferred == TYPE_PDF ) {
--    result = new indri::parse::PDFDocumentExtractor();
-   } else if( preferred == TYPE_TEXT ) {
-     result = new indri::parse::TextDocumentExtractor();
-   } else if( preferred == TYPE_MBOX ) {
-@@ -83,8 +79,6 @@
-     return TYPE_TAGGED;
-   } else if( type == "warc" || type == TYPE_WARC ) {
-     return TYPE_WARC;
--  } else if( type == "pdf" || type == "adobe pdf" || type == TYPE_PDF ) {
--    return TYPE_PDF;
-   } else if( type == "doc" || type == "msword" || type == "word" || type == "microsoft word" || type == TYPE_WORD ) {
-     return TYPE_WORD;
-   } else if( type == "ppt" || type == "powerpoint" || type == "msppt" || type == "microsoft powerpoint" || type == TYPE_PPT ) {
---- indri-5.4/src/FileClassEnvironmentFactory.cpp	čt črc  4 15:33:56 2013
-+++ indri-5.4/src/FileClassEnvironmentFactory.cpp	čt črc  4 15:33:20 2013
-@@ -55,8 +55,6 @@
- // case.  Values specified here can be in mixed case, since values are
- // matched in a case-sensitive manner.
- 
--static const char* pdf_index_tags[] = { "title", "author", 0 };
--static const char* pdf_metadata_tags[] = { "title", "author", 0 };
- static const char* html_index_tags[] = { "title", "author", "h1", "h2", "h3", "h4", 0 };
- static const char* html_metadata_tags[] = { "title", "author", 0 };
- //static const char* html_conflations[] = { "h1", NULL, NULL, "heading", "h2", NULL, NULL, "heading", "h3", NULL, NULL, "heading", "h4", NULL, NULL, "heading", "bloghpno", NULL, NULL, "docno", 0, 0, 0, 0 };
-@@ -279,21 +277,6 @@
- #endif
- 
-   {
--    "pdf",                // name
--    "html",               // parser
--    "word",               // tokenizer
--    "pdf",                // iterator
--    NULL,                 // startDocTag
--    NULL,                 // endDocTag
--    NULL,                 // endMetadataTag
--    NULL,                 // includeTags
--    NULL,                 // excludeTags
--    pdf_index_tags,       // indexTags
--    pdf_metadata_tags,    // metadataTags
--    NULL                  // conflations
--  },
--
--  {
-     "txt",                // name
-     "text",               // parser
-     "word",               // tokenizer
--- a/components/meta-packages/history/history	Thu Dec 17 23:00:14 2015 -0800
+++ b/components/meta-packages/history/history	Mon Dec 07 13:52:39 2015 +0100
@@ -299,6 +299,7 @@
 library/apr-util-13/[email protected],5.12-5.12.0.0.0.56.0
 library/apr-util-13/[email protected],5.12-5.12.0.0.0.56.0
 library/[email protected],5.12-5.12.0.0.0.56.0
+library/[email protected]
 library/java/[email protected],5.12-5.12.0.0.0.85.0
 library/perl-5/[email protected],5.12-5.12.0.0.0.81.0
 library/perl-5/[email protected],5.12-5.12.0.0.0.42.0