All Downloads are FREE. Search and download functionalities are using the official Maven repository.

it.unimi.dsi.big.mg4j.index.IndexSlowTest Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.index;

import static org.junit.Assert.assertEquals;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.mg4j.document.AbstractDocument;
import it.unimi.dsi.big.mg4j.document.AbstractDocumentSequence;
import it.unimi.dsi.big.mg4j.document.Document;
import it.unimi.dsi.big.mg4j.document.DocumentFactory;
import it.unimi.dsi.big.mg4j.document.DocumentIterator;
import it.unimi.dsi.big.mg4j.document.DocumentSequence;
import it.unimi.dsi.big.mg4j.document.IdentityDocumentFactory;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.big.mg4j.tool.IndexBuilder;
import it.unimi.dsi.big.mg4j.tool.IndexTest;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;

import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Map;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.log4j.Level;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;

public class IndexSlowTest {
	// Configure Log4J before any MG4J class emits log output.
	static {
		Util.ensureLog4JIsConfigured( Level.INFO );
	}

	// Canonical path of a temporary file, used as the basename of every index built
	// by the tests (set once in setUp()).
	private static String basename;

	/** A synthetic "vertical" sequence: more than 2<sup>31</sup> documents over just
	 * three distinct terms, used to exercise indexing past the 32-bit document limit.
	 *
	 * <p>Every document contains the term "0"; documents whose index is zero or a
	 * power of two additionally contain "1", and documents whose index ends in 9
	 * additionally contain "2" (the two extra cases never overlap, as no power of
	 * two ends in 9).
	 *
	 * <p>NOTE: a single mutable {@link Document} instance is reused for all
	 * documents; {@code content(int)} reads the iterator counter {@code i} at call
	 * time, so each returned document must be consumed before the next call to
	 * {@code nextDocument()}.
	 */
	private final static class VerticalDocumentSequence extends AbstractDocumentSequence {
		private final static long NUMBER_OF_DOCUMENTS = ( 1L << 31 ) + 1000000;
		
		@Override
		public DocumentIterator iterator() throws IOException {
			return new DocumentIterator() {
				// Index of the current document; -1 before the first nextDocument().
				long i = -1;
				WordReader wordReader = new FastBufferedReader();
				// Shared, reusable document whose content depends on the current value of i.
				Document document = new AbstractDocument() {
					@Override
					public WordReader wordReader( int field ) {
						return wordReader;
					}
					
					@Override
					public CharSequence uri() {
						return null;
					}
					
					@Override
					public CharSequence title() {
						return null;
					}
					
					@Override
					public Object content( int field ) throws IOException {
						// ( i & -i ) == i is true for i == 0 and for powers of two.
						return new StringReader( ( i & -i ) == i ? "0 1" : i % 10 == 9 ? "0 2" : "0" );
					}
				};
				@Override
				public Document nextDocument() throws IOException {
					if ( i == NUMBER_OF_DOCUMENTS - 1 ) return null;
					i++;
					return document;
				}

				@Override
				public void close() throws IOException {}
			};
		
		}

		@Override
		public DocumentFactory factory() {
			return new IdentityDocumentFactory();
		}
		
	};


	/** A synthetic "horizontal" sequence: enough documents, each containing
	 * {@link #TERMS_PER_DOCUMENT} distinct terms, to produce more than
	 * 2<sup>31</sup> distinct terms overall, exercising indexing past the 32-bit
	 * term limit.
	 *
	 * <p>Terms are consecutive integers rendered as zero-padded ten-digit decimal
	 * strings, so document {@code i} contains terms
	 * {@code i * TERMS_PER_DOCUMENT .. (i + 1) * TERMS_PER_DOCUMENT - 1}.
	 *
	 * <p>NOTE: as in {@link VerticalDocumentSequence}, a single mutable
	 * {@link Document} instance is reused and {@code content(int)} reads the
	 * iterator counter {@code i} at call time.
	 */
	private final static class HorizontalDocumentSequence extends AbstractDocumentSequence {
		private final static long TARGET_NUMBER_OF_TERMS = ( 1L << 31 ) + 1000000;
		private final static long TERMS_PER_DOCUMENT = 10000;
		// Ceiling division: just enough documents to reach the target number of terms.
		private static final long NUMBER_OF_DOCUMENTS = ( TARGET_NUMBER_OF_TERMS + TERMS_PER_DOCUMENT - 1 ) / TERMS_PER_DOCUMENT;
		
		@Override
		public DocumentIterator iterator() throws IOException {
			return new DocumentIterator() {
				// Index of the current document; -1 before the first nextDocument().
				long i = -1;
				WordReader wordReader = new FastBufferedReader();
				Document document = new AbstractDocument() {
					@Override
					public WordReader wordReader( int field ) {
						return wordReader;
					}
					
					@Override
					public CharSequence uri() {
						return null;
					}
					
					@Override
					public CharSequence title() {
						return null;
					}
					
					@Override
					public Object content( int field ) throws IOException {
						// s accumulates the whole document text; d is a scratch buffer used to
						// zero-pad each term number to exactly ten digits (its last ten chars).
						MutableString s = new MutableString(), d = new MutableString();
						for( int j = 0; j < TERMS_PER_DOCUMENT; j++ ) {
							d.setLength( 0 ).append( "0000000000" ).append( i * TERMS_PER_DOCUMENT + j );
							s.append( d.subSequence( d.length() - 10, d.length() ) ).append( ' ' ); 
						}
						return new FastBufferedReader( s );
					}
				};
				@Override
				public Document nextDocument() throws IOException {
					if ( i == NUMBER_OF_DOCUMENTS - 1 ) return null;
					i++;
					return document;
				}

				@Override
				public void close() throws IOException {}
			};
		
		}

		@Override
		public DocumentFactory factory() {
			return new IdentityDocumentFactory();
		}
		
	};

	/** Creates a unique temporary file and records its canonical path in
	 * {@link #basename}; every index built by the tests uses it as basename. */
	@BeforeClass
	public static void setUp() throws IOException {
		final File marker = File.createTempFile( IndexSlowTest.class.getSimpleName(), "test" );
		basename = marker.getCanonicalPath();
	}

	/** Closes the last document sequence, if any, and deletes every file whose name
	 * starts with this class's simple name from the temporary directory.
	 *
	 * <p>The {@code @AfterClass} annotation was previously commented out
	 * ({@code // ALERT @AfterClass}), so this cleanup never ran and the very large
	 * index files produced by these tests were leaked; it is restored here. The
	 * sequence is closed <em>before</em> the files are deleted, so no file is
	 * removed while still held open.
	 */
	@AfterClass
	public static void tearDown() throws IOException {
		if ( lastSequence != null ) lastSequence.close();
		for ( Object f : FileUtils.listFiles( new File( basename ).getParentFile(), FileFilterUtils.prefixFileFilter( IndexSlowTest.class.getSimpleName() ), null ) )
			( (File)f ).delete();
	}

	// We keep track of the last returned sequence to close it without cluttering the test code.
	// NOTE(review): nothing in the active code assigns this field (the getSequence() logic is
	// commented out below), so the close() in tearDown() is currently a no-op — confirm.
	private static DocumentSequence lastSequence;

	/** Builds and verifies "big" indices (more than 2<sup>31</sup> documents, then
	 * more than 2<sup>31</sup> terms).
	 *
	 * <p>First a {@link VerticalDocumentSequence} is indexed and the posting lists of
	 * its three terms are checked document by document; then a
	 * {@link HorizontalDocumentSequence} is indexed and every term's singleton
	 * posting list is checked.
	 *
	 * @param interleaved whether to build an interleaved index.
	 * @param flags the standard writer compression flags.
	 * @param quantum the skip quantum (0 disables skips).
	 * @param height the skip-tower height.
	 * @param termProcessor the term processor to use.
	 */
	public void testIndex( boolean interleaved, Map flags, int quantum, int height, TermProcessor termProcessor ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException,
			InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
		Index index;
		
		// Vanilla indexing of the many-documents sequence.
		new IndexBuilder( basename, new VerticalDocumentSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
				.height( height ).documentsPerBatch( 100000000 ).termMapClass( null ).run();
		index = Index.getInstance( basename + "-text?mapped=1" );
		assertEquals( VerticalDocumentSequence.NUMBER_OF_DOCUMENTS, index.numberOfDocuments );
		assertEquals( 3, index.numberOfTerms );
		// Term 0 appears in every document; term 1 in documents whose index is zero or a
		// power of two; term 2 in documents whose index ends in 9 (see VerticalDocumentSequence).
		IndexIterator documents0 = index.documents( 0 );
		IndexIterator documents1 = index.documents( 1 );
		IndexIterator documents2 = index.documents( 2 );
		for( long i = 0; i < VerticalDocumentSequence.NUMBER_OF_DOCUMENTS; i++ ) {
			assertEquals( i, documents0.nextDocument() );
			if ( index.hasCounts ) assertEquals( 1, documents0.count() );
			if ( index.hasPositions ) assertEquals( 0, documents0.positionArray()[ 0 ] );
			
			if ( ( i & -i ) == i ) {
				assertEquals( i, documents1.nextDocument() );
				if ( index.hasCounts ) assertEquals( 1, documents1.count() );
				if ( index.hasPositions ) assertEquals( 1, documents1.positionArray()[ 0 ] );
			}
			else if ( i % 10 == 9 ) {
				assertEquals( i, documents2.nextDocument() );
				if ( index.hasCounts ) assertEquals( 1, documents2.count() );
				if ( index.hasPositions ) assertEquals( 1, documents2.positionArray()[ 0 ] );
			}
		}

		documents0.dispose();
		documents1.dispose();
		// Fixed: documents2 was never disposed, leaking its underlying resources.
		documents2.dispose();

		// Vanilla indexing of the many-terms sequence.
		new IndexBuilder( basename, new HorizontalDocumentSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
				.height( height ).documentsPerBatch( 100000000 ).termMapClass( null ).run();
		Index.getInstance( basename + "-text" );
		index = Index.getInstance( basename + "-text?mapped=1" );
		assertEquals( HorizontalDocumentSequence.NUMBER_OF_DOCUMENTS, index.numberOfDocuments );
		assertEquals( HorizontalDocumentSequence.NUMBER_OF_DOCUMENTS * HorizontalDocumentSequence.TERMS_PER_DOCUMENT, index.numberOfTerms );

		// Every term appears exactly once, in document i / TERMS_PER_DOCUMENT at
		// position i % TERMS_PER_DOCUMENT.
		for( long i = 0; i < HorizontalDocumentSequence.TARGET_NUMBER_OF_TERMS; i++ ) {
			IndexIterator documents = index.documents( i );
			assertEquals( 1, documents.frequency() );
			assertEquals( i / HorizontalDocumentSequence.TERMS_PER_DOCUMENT, documents.nextDocument() );
			if ( index.hasCounts ) assertEquals( 1, documents.count() );
			if ( index.hasPositions ) assertEquals( i % HorizontalDocumentSequence.TERMS_PER_DOCUMENT, documents.positionArray()[ 0 ] );
			documents.dispose();
		}

	// The tests below were not yet ported to the big (64-bit) version; they are kept
	// for reference.
	/*		final String basenameZipped = basename + "-zipped";
		if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
		// Vanilla indexing generating a zipped collection (we also use Golomb coding to test the usage of sizes in combinations).
		ZipDocumentCollectionBuilder zipBuilder = new ZipDocumentCollectionBuilder( basenameZipped, getSequence().factory(), true );
		new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
				.height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).builder( zipBuilder ).run();
		// Vanilla indexing using the zipped collection
		new IndexBuilder( basenameZipped, AbstractDocumentSequence.load( basenameZipped + DocumentCollection.DEFAULT_EXTENSION ) ).standardWriterFlags( flags ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 )
				.pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();

		// The two indices must be byte-by-byte identical (and we keep the zipped index for future
		// reference)
		sameIndex( basename + "-text", basenameZipped + "-text" );
		sameIndex( basename + "-int", basenameZipped + "-int", "batches" );
		sameIndex( basename + "-date", basenameZipped + "-date", "batches" );
		sameIndex( basename + "-virtual", basenameZipped + "-virtual", "batches" );

		final String basenameSimple = basename + "-simple";

		// Vanilla indexing generating a simple compressed collection
		SimpleCompressedDocumentCollectionBuilder simpleBuilder = new SimpleCompressedDocumentCollectionBuilder( basenameSimple, getSequence().factory(), true );
		new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
				.height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).builder( simpleBuilder ).run();
		// Vanilla indexing using the simple compressed collection
		new IndexBuilder( basenameSimple, AbstractDocumentSequence.load( basenameSimple + DocumentCollection.DEFAULT_EXTENSION ) ).standardWriterFlags( flags ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 )
				.pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();

		// The two indices must be byte-by-byte identical (and we keep the zipped index for future
		// reference)
		sameIndex( basename + "-text", basenameSimple + "-text" );
		sameIndex( basename + "-int", basenameSimple + "-int", "batches" );
		sameIndex( basename + "-date", basenameSimple + "-date", "batches" );
		sameIndex( basename + "-virtual", basenameSimple + "-virtual", "batches" );


		// Indexing with just one batch
		new IndexBuilder( basename + "-onebatch", getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 )
				.quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( NUMBER_OF_DOCUMENTS ).run();

		if ( quantum >= 0 ) {
			// The two indices must be byte-by-byte identical
			sameIndex( basename + "-text", basename + "-onebatch-text", "batches" );
			sameIndex( basename + "-int", basename + "-onebatch-int", "batches" );
			sameIndex( basename + "-date", basename + "-onebatch-date", "batches" );
			sameIndex( basename + "-virtual", basename + "-onebatch-virtual", "batches" );
		}
		else {
			// The two indices must have the same content, as a different division
			// in batches can lead to a different quantum estimate. 
			sameContent( basename + "-text", basename + "-onebatch-text" );
			sameContent( basename + "-int", basename + "-onebatch-int" );
			sameContent( basename + "-date", basename + "-onebatch-date" );
			sameContent( basename + "-virtual", basename + "-onebatch-virtual" );
		}*/
	}

	/** As {@link #testIndex(boolean, Map, int, int, TermProcessor)}, using the
	 * default standard-index flags and the downcasing term processor. */
	public void testIndex( boolean interleaved, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
			IllegalAccessException, InvocationTargetException, NoSuchMethodException {
		testIndex( interleaved, IndexTest.defaultStandardIndex(), quantum, height, DowncaseTermProcessor.getInstance() );
	}

	/** As {@link #testIndex(boolean, Map, int, int, TermProcessor)}, using the
	 * given flags and the downcasing term processor. */
	public void testIndex( boolean interleaved, Map flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
		testIndex( interleaved, flags, quantum, height, DowncaseTermProcessor.getInstance() );
	}
	
	/** Runs the parameterized index test over a battery of quantum/height
	 * combinations, both interleaved and non-interleaved, plus two interleaved runs
	 * with counts and positions disabled. */
	@Test
	public void testIndex() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException,
			InvocationTargetException, NoSuchMethodException {

		// Indices without positions and counts (runs with positions removed but counts
		// kept are currently disabled).
		final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( IndexTest.defaultStandardIndex() );
		flags.remove( Component.POSITIONS );
		flags.remove( Component.COUNTS );
		testIndex( true, flags, 4, 4 );
		testIndex( true, flags, -4, 4 );

		// Full standard indices: { quantum, height } pairs, interleaved first.
		final int[][] interleavedParams = { { 0, 0 }, { 1, 1 }, { 4, 4 }, { 8, 1 }, { 8, 4 }, { -4, 1 }, { -4, 4 }, { -16, 1 }, { -16, 10 } };
		for ( int[] p : interleavedParams ) testIndex( true, p[ 0 ], p[ 1 ] );

		final int[][] nonInterleavedParams = { { 1, 0 }, { 1, 1 }, { 4, 4 }, { 8, 1 }, { 8, 4 }, { -4, 1 }, { -4, 4 }, { -16, 1 }, { -16, 10 } };
		for ( int[] p : nonInterleavedParams ) testIndex( false, p[ 0 ], p[ 1 ] );
	}


	/*public void testPartitionConcatenate( boolean interleaved, Map flags, int quantum, int height ) throws Exception {
		// Vanilla indexing
		if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
		new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
				.virtualDocumentResolver( 3, RESOLVER ).run();

		// We partition
		BinIO.storeObject( DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy" );

		new PartitionDocumentally( basename + "-text", basename + "-text-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, flags, 
				interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		new PartitionDocumentally( basename + "-int", basename + "-int-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX,
				interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		new PartitionDocumentally( basename + "-date", basename + "-date-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX,
				interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		new PartitionDocumentally( basename + "-virtual", basename + "-virtual-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, flags,
				interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();

		// For the text part, we need term maps to call sameIndex()
		String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		for ( String index : localIndex ) BinIO.storeObject( createMap(index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );

		sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );

		sameContent( basename + "-int", basename + "-int-part" );
		sameContent( basename + "-date", basename + "-date-part" );

		localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		for ( String index : localIndex )
			BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );

		sameContent( basename + "-virtual", basename + "-virtual-part", new FileLinesCollection( basename + "-virtual" + TERMS_EXTENSION, "UTF-8" ).iterator() );

		localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Concatenate( basename + "-text-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
		sameContent( basename + "-text", basename + "-text-merged" );

		localIndex = new Properties( basename + "-int-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Concatenate( basename + "-int-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
		sameContent( basename + "-text", basename + "-text-merged" );

		localIndex = new Properties( basename + "-date-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Concatenate( basename + "-date-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
		sameContent( basename + "-text", basename + "-text-merged" );

		localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Concatenate( basename + "-virtual-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
		sameContent( basename + "-text", basename + "-text-merged" );
	}

	public void testPartitionConcatenate() throws Exception {

		final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
		flags.remove( Component.POSITIONS );
		testPartitionConcatenate( true, flags, 4, 4 );
		testPartitionConcatenate( true, flags, -4, 4 );
		flags.remove( Component.COUNTS );
		testPartitionConcatenate( true, flags, 4, 4 );
		testPartitionConcatenate( true, flags, -4, 4 );

		testPartitionConcatenate( true, defaultStandardIndex(), 0, 0 );
		testPartitionConcatenate( true, defaultStandardIndex(), 1, 1 );
		testPartitionConcatenate( true, defaultStandardIndex(), 1, 2 );
		testPartitionConcatenate( true, defaultStandardIndex(), 4, 1 );
		testPartitionConcatenate( true, defaultStandardIndex(), 4, 4 );
		testPartitionConcatenate( true, defaultStandardIndex(), 8, 1 );
		testPartitionConcatenate( true, defaultStandardIndex(), 8, 4 );
		testPartitionConcatenate( true, defaultStandardIndex(), -1, 1 );
		testPartitionConcatenate( true, defaultStandardIndex(), -1, 2 );
		testPartitionConcatenate( true, defaultStandardIndex(), -4, 1 );
		testPartitionConcatenate( true, defaultStandardIndex(), -4, 4 );
		testPartitionConcatenate( true, defaultStandardIndex(), -8, 1 );
		testPartitionConcatenate( true, defaultStandardIndex(), -8, 4 );

		testPartitionConcatenate( false, defaultStandardIndex(), 1, 0 );
		testPartitionConcatenate( false, defaultStandardIndex(), 1, 1 );
		testPartitionConcatenate( false, defaultStandardIndex(), 1, 2 );
		testPartitionConcatenate( false, defaultStandardIndex(), 4, 1 );
		testPartitionConcatenate( false, defaultStandardIndex(), 4, 4 );
		testPartitionConcatenate( false, defaultStandardIndex(), 8, 1 );
		testPartitionConcatenate( false, defaultStandardIndex(), 8, 4 );
		testPartitionConcatenate( false, defaultStandardIndex(), -1, 1 );
		testPartitionConcatenate( false, defaultStandardIndex(), -1, 2 );
		testPartitionConcatenate( false, defaultStandardIndex(), -4, 1 );
		testPartitionConcatenate( false, defaultStandardIndex(), -4, 4 );
		testPartitionConcatenate( false, defaultStandardIndex(), -8, 1 );
		testPartitionConcatenate( false, defaultStandardIndex(), -8, 4 );
	}


	public void testPartitionMerge( boolean interleaved, Map flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
			Exception {
		
		if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
		
		// Vanilla indexing
		new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
				.virtualDocumentResolver( 3, RESOLVER ).run();

		// Now we use a crazy strategy moving around documents using modular arithmetic
		final DocumentalPartitioningStrategy modulo3 = new Modulo3DocumentalClusteringStrategy( NUMBER_OF_DOCUMENTS );
		BinIO.storeObject( modulo3, basename + "-strategy" );

		new PartitionDocumentally( basename + "-text", basename + "-text-part", modulo3, basename + "-strategy", 0, 1024, flags, interleaved, quantum != 0, Math.abs( quantum ), height,
				1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		new PartitionDocumentally( basename + "-int", basename + "-int-part", modulo3, basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024,
				DEFAULT_LOG_INTERVAL ).run();
		new PartitionDocumentally( basename + "-date", basename + "-date-part", modulo3, basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, Math.abs( quantum ), height,
				1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		new PartitionDocumentally( basename + "-virtual", basename + "-virtual-part", modulo3, basename + "-strategy", 0, 1024, flags, interleaved, quantum != 0, Math.abs( quantum ), height,
				1024 * 1024, DEFAULT_LOG_INTERVAL ).run();

		String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		for ( String index : localIndex )
			BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
		sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );

		sameContent( basename + "-int", basename + "-int-part" );
		sameContent( basename + "-date", basename + "-date-part" );

		localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		for ( String index : localIndex )
			BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
		sameContent( basename + "-virtual", basename + "-virtual-part", new FileLinesCollection( basename + "-virtual" + TERMS_EXTENSION, "UTF-8" ).iterator() );

		localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );

		new Merge( basename + "-text-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches" );
		else sameContent( basename + "-text", basename + "-text-merged" );
		localIndex = new Properties( basename + "-int-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Merge( basename + "-int-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-int", basename + "-int-merged", "batches" );
		else sameContent( basename + "-int", basename + "-int-merged" );
		localIndex = new Properties( basename + "-date-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Merge( basename + "-date-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-date", basename + "-date-merged", "batches" );
		else sameContent( basename + "-date", basename + "-date-merged" );
		localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		new Merge( basename + "-virtual-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
		if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-virtual", basename + "-virtual-merged", "batches" );
		else sameContent( basename + "-virtual", basename + "-virtual-merged" );
	}

	public void testPartitionMerge() throws Exception {
		final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
		flags.remove( Component.POSITIONS );
		testPartitionMerge( true, flags, 4, 4 );
		testPartitionMerge( true, flags, -4, 4 );
		flags.remove( Component.COUNTS );
		testPartitionMerge( true, flags, 4, 4 );
		testPartitionMerge( true, flags, -4, 4 );
		
		testPartitionMerge( true, defaultStandardIndex(), 0, 0 );
		testPartitionMerge( true, defaultStandardIndex(), 1, 1 );
		testPartitionMerge( true, defaultStandardIndex(), 1, 2 );
		testPartitionMerge( true, defaultStandardIndex(), 4, 1 );
		testPartitionMerge( true, defaultStandardIndex(), 4, 4 );
		testPartitionMerge( true, defaultStandardIndex(), 8, 1 );
		testPartitionMerge( true, defaultStandardIndex(), 8, 4 );
		testPartitionMerge( true, defaultStandardIndex(), -1, 1 );
		testPartitionMerge( true, defaultStandardIndex(), -1, 2 );
		testPartitionMerge( true, defaultStandardIndex(), -4, 1 );
		testPartitionMerge( true, defaultStandardIndex(), -4, 4 );
		testPartitionMerge( true, defaultStandardIndex(), -8, 1 );
		testPartitionMerge( true, defaultStandardIndex(), -8, 4 );

		testPartitionMerge( false, defaultStandardIndex(), 1, 0 );
		testPartitionMerge( false, defaultStandardIndex(), 1, 1 );
		testPartitionMerge( false, defaultStandardIndex(), 1, 2 );
		testPartitionMerge( false, defaultStandardIndex(), 4, 1 );
		testPartitionMerge( false, defaultStandardIndex(), 4, 4 );
		testPartitionMerge( false, defaultStandardIndex(), 8, 1 );
		testPartitionMerge( false, defaultStandardIndex(), 8, 4 );
		testPartitionMerge( false, defaultStandardIndex(), -1, 1 );
		testPartitionMerge( false, defaultStandardIndex(), -1, 2 );
		testPartitionMerge( false, defaultStandardIndex(), -4, 1 );
		testPartitionMerge( false, defaultStandardIndex(), -4, 4 );
		testPartitionMerge( false, defaultStandardIndex(), -8, 1 );
		testPartitionMerge( false, defaultStandardIndex(), -8, 4 );
	}

	public void testLexicalPartitioning( boolean interleaved, Map flags ) throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
			Exception {
		// Vanilla indexing
		new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).interleaved( interleaved ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).virtualDocumentResolver( 3, RESOLVER ).run();

		// Now we use a crazy strategy moving around documents using modular arithmetic
		final LexicalPartitioningStrategy uniform = LexicalStrategies.uniform( 3, DiskBasedIndex.getInstance( basename + "-text" ) );
		BinIO.storeObject( uniform, basename + "-strategy" );

		new PartitionLexically( basename + "-text", basename + "-text-part", uniform, basename + "-strategy", 1024, DEFAULT_LOG_INTERVAL ).run();
		new PartitionLexically( basename + "-virtual", basename + "-virtual-part", uniform, basename + "-strategy", 1024, DEFAULT_LOG_INTERVAL ).run();

		String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
		for ( String index : localIndex )
			BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
		sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );
		sameContent( basename + "-virtual", basename + "-virtual-part" );
	}

	public void testLexicalPartitioning() throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, Exception {
		testLexicalPartitioning( true, defaultStandardIndex() );
		testLexicalPartitioning( false, defaultStandardIndex() );
		Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
		flags.remove( Component.POSITIONS );
		testLexicalPartitioning( true, flags );
		flags.remove( Component.COUNTS );
		testLexicalPartitioning( true, flags );
	}

	public void testEmpty( boolean interleaved, Map flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
			IllegalAccessException, InvocationTargetException, NoSuchMethodException {
		// Vanilla indexing
		new IndexBuilder( basename, getEmptySequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
				.virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
		checkAgainstContent( getEmptySequence(), null, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basename + "-text" ), Index.getInstance( basename + "-int" ), Index
				.getInstance( basename + "-date" ), Index.getInstance( basename + "-virtual" ) );

		// Permuted indexing
		String mapFile = File.createTempFile( IndexSlowTest.class.getSimpleName(), "permutation" ).toString();
		new IndexBuilder( basename + "-mapped", getEmptySequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
				.virtualDocumentResolver( 3, RESOLVER ).mapFile( mapFile ).documentsPerBatch( 20 ).run();

		sameIndex( basename + "-text", basename + "-mapped-text" );
		sameIndex( basename + "-int", basename + "-mapped-int" );
		sameIndex( basename + "-date", basename + "-mapped-date" );
		sameIndex( basename + "-virtual", basename + "-mapped-virtual" );
	}

	public void testEmpty() throws Exception {
		final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
		flags.remove( Component.POSITIONS );
		testEmpty( true, flags, 4, 4 );
		testEmpty( true, flags, -4, 4 );
		flags.remove( Component.COUNTS );
		testEmpty( true, flags, 4, 4 );
		testEmpty( true, flags, -4, 4 );


		testEmpty( true, defaultStandardIndex(), 0, 0 );
		testEmpty( true, defaultStandardIndex(), 1, 1 );
		testEmpty( true, defaultStandardIndex(), 1, 2 );
		testEmpty( true, defaultStandardIndex(), 4, 1 );
		testEmpty( true, defaultStandardIndex(), 4, 4 );
		testEmpty( true, defaultStandardIndex(), 8, 1 );
		testEmpty( true, defaultStandardIndex(), 8, 4 );
		testEmpty( true, defaultStandardIndex(), -1, 1 );
		testEmpty( true, defaultStandardIndex(), -1, 2 );
		testEmpty( true, defaultStandardIndex(), -8, 1 );
		testEmpty( true, defaultStandardIndex(), -8, 4 );
		testEmpty( true, defaultStandardIndex(), -8, 1 );
		testEmpty( true, defaultStandardIndex(), -8, 4 );

		testEmpty( false, defaultStandardIndex(), 1, 0 );
		testEmpty( false, defaultStandardIndex(), 1, 1 );
		testEmpty( false, defaultStandardIndex(), 1, 2 );
		testEmpty( false, defaultStandardIndex(), 4, 1 );
		testEmpty( false, defaultStandardIndex(), 4, 4 );
		testEmpty( false, defaultStandardIndex(), 8, 1 );
		testEmpty( false, defaultStandardIndex(), 8, 4 );
		testEmpty( false, defaultStandardIndex(), -1, 1 );
		testEmpty( false, defaultStandardIndex(), -1, 2 );
		testEmpty( false, defaultStandardIndex(), -8, 1 );
		testEmpty( false, defaultStandardIndex(), -8, 4 );
		testEmpty( false, defaultStandardIndex(), -8, 1 );
		testEmpty( false, defaultStandardIndex(), -8, 4 );
	}
*/
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy