All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.mg4j.tool.URLMPHVirtualDocumentResolver Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.tool;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2006-2011 Paolo Boldi 
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.bits.TransformationStrategies;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.FileLinesCollection;
import it.unimi.dsi.io.LineIterator;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.logging.ProgressLogger;
import it.unimi.dsi.big.mg4j.document.Document;
import it.unimi.dsi.big.mg4j.util.MG4JClassParser;
import it.unimi.dsi.sux4j.mph.MWHCFunction;
import it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction;
import it.unimi.dsi.util.BloomFilter;
import it.unimi.dsi.big.util.ShiftAddXorSignedStringMap;
import it.unimi.dsi.big.util.StringMap;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.Collection;

import org.apache.commons.lang.RandomStringUtils;
import org.apache.log4j.Logger;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;

/** A virtual-document resolver based on document URIs.
 *
 * 

Instances of this class store in a {@link StringMap} instances * all URIs from a collection, and consider a virtual-document specification a (possibly relative) URI. The * virtual-document specification is resolved against the document URI, and then the perfect hash is used * to retrieve the corresponding document. * *

This class provides a main method that helps in building serialised resolvers from URI lists. * In case of pathological document collections with duplicate URIs (most notably, the GOV2 collection * used for TREC evaluations), an option makes it possible to add random noise to duplicates, so that * minimal perfect hash construction does not go into an infinite loop. It is a rather crude solution, but it * is nonsensical to have duplicate URIs in the first place. Additional option include the kind of minimal perfect * hash function you want to use (e.g., out of {@link it.unimi.dsi.sux4j}) and the number of bits used to sign them. */ public class URLMPHVirtualDocumentResolver implements VirtualDocumentResolver { private static final long serialVersionUID = 1L; private static final Logger LOGGER = Logger.getLogger( URLMPHVirtualDocumentResolver.class ); /** The term map used by this resolver to associated URI strings to numbers. */ private final StringMap url2DocumentPointer; /** The cached URI of the last argument to {@link #context(Document)}. */ private transient URI documentURI; public URLMPHVirtualDocumentResolver( final StringMap url2DocumentPointer ) { this.url2DocumentPointer = url2DocumentPointer; } public void context( final Document document ) { try { documentURI = new URI( document.uri().toString() ).normalize(); } catch ( URISyntaxException e ) { documentURI = null; } } public long resolve( final CharSequence virtualDocumentSpec ) { try { URI virtualURI = URI.create( virtualDocumentSpec.toString() ).normalize(); if ( ! virtualURI.isAbsolute() ) { if ( documentURI == null ) return -1; virtualURI = documentURI.resolve( virtualURI ); } // TODO discard opaque? return url2DocumentPointer.getLong( virtualURI.toString() ); } catch ( Exception e ) { return -1; } } public long numberOfDocuments() { return url2DocumentPointer.size64(); } private static void makeUnique( final BloomFilter filter, final MutableString uri ) { while( ! 
filter.add( uri ) ) { LOGGER.debug( "Duplicate URI " + uri ); uri.append( '/' ).append( RandomStringUtils.randomAlphanumeric( 32 ) ); } } public static void main( final String[] arg ) throws JSAPException, IOException { final SimpleJSAP jsap = new SimpleJSAP( URLMPHVirtualDocumentResolver.class.getName(), "Builds a URL document resolver from a sequence of URIs, extracted typically using ScanMetadata, using a suitable function. You can specify that the list is sorted, in which case it is possible to generate a resolver that occupies less space.", new Parameter[] { new Switch( "sorted", 's', "sorted", "URIs are sorted: use a monotone minimal perfect hash function." ), new Switch( "iso", 'i', "iso", "Use ISO-8859-1 coding internally (i.e., just use the lower eight bits of each character)." ), new FlaggedOption( "bufferSize", JSAP.INTSIZE_PARSER, "64Ki", JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of the I/O buffer used to read terms." ), new FlaggedOption( "class", MG4JClassParser.getParser(), JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'c', "class", "A class used to create the function from URIs to their ranks; defaults to it.unimi.dsi.sux4j.mph.MHWCFunction for non-sorted inputs, and to it.unimi.dsi.sux4j.mph.TwoStepsLcpMonotoneMinimalPerfectHashFunction for sorted inputs." ), new FlaggedOption( "width", JSAP.INTEGER_PARSER, Integer.toString( Long.SIZE ), JSAP.NOT_REQUIRED, 'w', "width", "The width, in bits, of the signatures used to sign the function from URIs to their rank." ), new FlaggedOption( "termFile", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'o', "offline", "Read terms from this file (without loading them into core memory) instead of standard input." 
), new FlaggedOption( "uniqueUris", JSAP.INTSIZE_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'U', "unique-uris", "Force URIs to be unique by adding random garbage at the end of duplicates; the argument is an upper bound for the number of URIs that will be read, and will be used to create a Bloom filter." ), new UnflaggedOption( "resolver", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.REQUIRED, JSAP.NOT_GREEDY, "The filename for the resolver." ) }); JSAPResult jsapResult = jsap.parse( arg ); if ( jsap.messagePrinted() ) return; final int bufferSize = jsapResult.getInt( "bufferSize" ); final String resolverName = jsapResult.getString( "resolver" ); //final Class tableClass = jsapResult.getClass( "class" ); final boolean iso = jsapResult.getBoolean( "iso" ); String termFile = jsapResult.getString( "termFile" ); BloomFilter filter = null; final boolean uniqueURIs = jsapResult.userSpecified( "uniqueUris" ); if ( uniqueURIs ) filter = new BloomFilter( jsapResult.getInt( "uniqueUris" ) ); final Collection collection; if ( termFile == null ) { ArrayList termList = new ArrayList(); final ProgressLogger pl = new ProgressLogger(); pl.itemsName = "URIs"; final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( System.in, "UTF-8" ), bufferSize ), pl ); pl.start( "Reading URIs..." ); MutableString uri; while( termIterator.hasNext() ) { uri = termIterator.next(); if ( uniqueURIs ) makeUnique( filter, uri ); termList.add( uri.copy() ); } pl.done(); collection = termList; } else { if ( uniqueURIs ) { // Create temporary file with unique URIs final ProgressLogger pl = new ProgressLogger(); pl.itemsName = "URIs"; pl.start( "Copying URIs..." 
); final LineIterator termIterator = new LineIterator( new FastBufferedReader( new InputStreamReader( new FileInputStream( termFile ) ), bufferSize ), pl ); File temp = File.createTempFile( URLMPHVirtualDocumentResolver.class.getName(), ".uniqueuris" ); temp.deleteOnExit(); termFile = temp.toString(); final FastBufferedOutputStream outputStream = new FastBufferedOutputStream( new FileOutputStream( termFile ), bufferSize ); MutableString uri; while( termIterator.hasNext() ) { uri = termIterator.next(); makeUnique( filter, uri ); uri.writeUTF8( outputStream ); outputStream.write( '\n' ); } pl.done(); outputStream.close(); } collection = new FileLinesCollection( termFile, "UTF-8" ); } LOGGER.debug( "Building function..." ); final int width = jsapResult.getInt( "width" ); if ( jsapResult.getBoolean( "sorted" ) ) BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new TwoStepsLcpMonotoneMinimalPerfectHashFunction( collection, iso ? TransformationStrategies.prefixFreeIso() : TransformationStrategies.prefixFreeUtf16() ), width ) ), resolverName ); else BinIO.storeObject( new URLMPHVirtualDocumentResolver( new ShiftAddXorSignedStringMap( collection.iterator(), new MWHCFunction( collection, iso ? TransformationStrategies.iso() : TransformationStrategies.utf16() ), width ) ), resolverName ); LOGGER.debug( " done." ); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy