![JAR search and dependency download from the Maven repository](/logo.png)
slow.it.unimi.dsi.big.mg4j.index.IndexSlowTest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.index;
import static org.junit.Assert.assertEquals;
import it.unimi.dsi.Util;
import it.unimi.dsi.big.mg4j.document.AbstractDocument;
import it.unimi.dsi.big.mg4j.document.AbstractDocumentSequence;
import it.unimi.dsi.big.mg4j.document.Document;
import it.unimi.dsi.big.mg4j.document.DocumentFactory;
import it.unimi.dsi.big.mg4j.document.DocumentIterator;
import it.unimi.dsi.big.mg4j.document.DocumentSequence;
import it.unimi.dsi.big.mg4j.document.IdentityDocumentFactory;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Coding;
import it.unimi.dsi.big.mg4j.index.CompressionFlags.Component;
import it.unimi.dsi.big.mg4j.tool.IndexBuilder;
import it.unimi.dsi.big.mg4j.tool.IndexTest;
import it.unimi.dsi.fastutil.objects.Reference2ObjectOpenHashMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.lang.reflect.InvocationTargetException;
import java.net.URISyntaxException;
import java.util.Map;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.FileFilterUtils;
import org.apache.log4j.Level;
import org.junit.BeforeClass;
import org.junit.Test;
public class IndexSlowTest {
static {
// Runs once when the class is loaded, before any test: asks the dsiutils helper to make sure
// log4j is configured, passing INFO as the level.
// NOTE(review): presumably this leaves an existing log4j configuration alone — confirm against Util.
Util.ensureLog4JIsConfigured( Level.INFO );
}
private static String basename;
/**
 * A synthetic sequence with more than 2<sup>31</sup> tiny documents and just three distinct terms.
 *
 * <p>Every document contains the term <code>0</code>. Documents whose index is zero or a power of two
 * additionally contain <code>1</code>; of the remaining documents, those whose index is congruent
 * to 9 modulo 10 additionally contain <code>2</code>.
 */
private static final class VerticalDocumentSequence extends AbstractDocumentSequence {
    /** Number of documents returned by the iterator: 2<sup>31</sup> plus one million. */
    private static final long NUMBER_OF_DOCUMENTS = ( 1L << 31 ) + 1000000;

    /**
     * Returns an iterator that hands out the same mutable document object over and over;
     * the document content depends on the index of the last document returned.
     */
    @Override
    public DocumentIterator iterator() throws IOException {
        return new DocumentIterator() {
            /** Index of the document most recently returned (-1 before the first call). */
            long documentIndex = -1;
            /** Word reader shared by all documents of this iterator. */
            final WordReader sharedWordReader = new FastBufferedReader();
            /** The single document instance returned for every position. */
            final Document reusedDocument = new AbstractDocument() {
                @Override
                public WordReader wordReader( int field ) {
                    return sharedWordReader;
                }

                @Override
                public CharSequence uri() {
                    return null;
                }

                @Override
                public CharSequence title() {
                    return null;
                }

                @Override
                public Object content( int field ) throws IOException {
                    final String text;
                    // ( x & -x ) == x holds exactly when x has at most one bit set.
                    if ( ( documentIndex & -documentIndex ) == documentIndex ) text = "0 1";
                    else if ( documentIndex % 10 == 9 ) text = "0 2";
                    else text = "0";
                    return new StringReader( text );
                }
            };

            @Override
            public Document nextDocument() throws IOException {
                if ( documentIndex != NUMBER_OF_DOCUMENTS - 1 ) {
                    documentIndex++;
                    return reusedDocument;
                }
                // All documents have been returned.
                return null;
            }

            @Override
            public void close() throws IOException {}
        };
    }

    @Override
    public DocumentFactory factory() {
        return new IdentityDocumentFactory();
    }
}
/**
 * A synthetic sequence with relatively few documents but more than 2<sup>31</sup> distinct terms.
 *
 * <p>Each document contains {@link #TERMS_PER_DOCUMENT} consecutive numbers, written as
 * ten-character zero-padded decimal strings and separated by spaces; no number appears in
 * two documents.
 */
private static final class HorizontalDocumentSequence extends AbstractDocumentSequence {
    /** Minimum number of distinct terms to generate: 2<sup>31</sup> plus one million. */
    private static final long TARGET_NUMBER_OF_TERMS = ( 1L << 31 ) + 1000000;
    /** Number of terms written into each document. */
    private static final long TERMS_PER_DOCUMENT = 10000;
    /** Number of documents needed to reach the target: the target divided by the terms per document, rounded up. */
    private static final long NUMBER_OF_DOCUMENTS = ( TARGET_NUMBER_OF_TERMS + TERMS_PER_DOCUMENT - 1 ) / TERMS_PER_DOCUMENT;

    /**
     * Returns an iterator that hands out the same mutable document object over and over;
     * the document content depends on the index of the last document returned.
     */
    @Override
    public DocumentIterator iterator() throws IOException {
        return new DocumentIterator() {
            /** Index of the document most recently returned (-1 before the first call). */
            long documentIndex = -1;
            /** Word reader shared by all documents of this iterator. */
            final WordReader sharedWordReader = new FastBufferedReader();
            /** The single document instance returned for every position. */
            final Document reusedDocument = new AbstractDocument() {
                @Override
                public WordReader wordReader( int field ) {
                    return sharedWordReader;
                }

                @Override
                public CharSequence uri() {
                    return null;
                }

                @Override
                public CharSequence title() {
                    return null;
                }

                @Override
                public Object content( int field ) throws IOException {
                    final MutableString text = new MutableString();
                    final MutableString padded = new MutableString();
                    // First term number belonging to the current document.
                    final long firstTerm = documentIndex * TERMS_PER_DOCUMENT;
                    for ( int offset = 0; offset < TERMS_PER_DOCUMENT; offset++ ) {
                        // Prepend ten zeroes and keep the last ten characters: a fixed-width term.
                        padded.setLength( 0 );
                        padded.append( "0000000000" ).append( firstTerm + offset );
                        final int end = padded.length();
                        text.append( padded.subSequence( end - 10, end ) ).append( ' ' );
                    }
                    return new FastBufferedReader( text );
                }
            };

            @Override
            public Document nextDocument() throws IOException {
                if ( documentIndex != NUMBER_OF_DOCUMENTS - 1 ) {
                    documentIndex++;
                    return reusedDocument;
                }
                // All documents have been returned.
                return null;
            }

            @Override
            public void close() throws IOException {}
        };
    }

    @Override
    public DocumentFactory factory() {
        return new IdentityDocumentFactory();
    }
}
/**
 * Creates a temporary file named after this class and stores its canonical path in
 * {@link #basename}, which the tests use as the basename of the indices they build.
 *
 * @throws IOException if the temporary file cannot be created or its path cannot be resolved.
 */
@BeforeClass
public static void setUp() throws IOException {
    final String prefix = IndexSlowTest.class.getSimpleName();
    final File placeholder = File.createTempFile( prefix, "test" );
    basename = placeholder.getCanonicalPath();
}
// ALERT @AfterClass
// The annotation above is commented out, so JUnit does not invoke this method.
/**
 * Deletes every file in the directory of {@link #basename} whose name starts with the simple
 * name of this class, then closes the last tracked document sequence, if there is one.
 *
 * @throws IOException if closing the tracked sequence fails.
 */
public static void tearDown() throws IOException {
    final File directory = new File( basename ).getParentFile();
    final String prefix = IndexSlowTest.class.getSimpleName();
    for ( Object candidate : FileUtils.listFiles( directory, FileFilterUtils.prefixFileFilter( prefix ), null ) ) {
        final File file = (File)candidate;
        file.delete();
    }
    if ( lastSequence == null ) return;
    lastSequence.close();
}
// We keep track of the last returned sequence to close it without cluttering the test code
private static DocumentSequence lastSequence;
/**
 * Builds and checks two indices that exceed the 2<sup>31</sup> limit, one after the other, both
 * under {@link #basename}.
 *
 * <p>The first index is built from a {@link VerticalDocumentSequence} (more than 2<sup>31</sup>
 * documents, three terms): the posting lists of the three terms are scanned in parallel and
 * compared against the rule used to generate the documents. The second index is built from a
 * {@link HorizontalDocumentSequence} (more than 2<sup>31</sup> terms): each of the first
 * {@link HorizontalDocumentSequence#TARGET_NUMBER_OF_TERMS} terms must have frequency one and
 * appear in the expected document at the expected position.
 *
 * @param interleaved passed to {@link IndexBuilder#interleaved(boolean)}.
 * @param flags the writer flags passed to the index builder.
 * @param quantum the quantum passed to the index builder; skips are enabled if and only if it is nonzero.
 * @param height the height passed to the index builder.
 * @param termProcessor the term processor passed to the index builder.
 */
public void testIndex( boolean interleaved, Map flags, int quantum, int height, TermProcessor termProcessor ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException,
        InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    Index index;

    // Vanilla indexing
    new IndexBuilder( basename, new VerticalDocumentSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
            .height( height ).documentsPerBatch( 100000000 ).termMapClass( null ).run();
    index = Index.getInstance( basename + "-text?mapped=1" );
    assertEquals( VerticalDocumentSequence.NUMBER_OF_DOCUMENTS, index.numberOfDocuments );
    assertEquals( 3, index.numberOfTerms );

    // One posting list per term, in the same order as the terms "0", "1" and "2".
    IndexIterator documents0 = index.documents( 0 );
    IndexIterator documents1 = index.documents( 1 );
    IndexIterator documents2 = index.documents( 2 );
    for( long i = 0; i < VerticalDocumentSequence.NUMBER_OF_DOCUMENTS; i++ ) {
        // Term 0 appears in every document, always at position 0.
        assertEquals( i, documents0.nextDocument() );
        if ( index.hasCounts ) assertEquals( 1, documents0.count() );
        if ( index.hasPositions ) assertEquals( 0, documents0.positionArray()[ 0 ] );
        // The two branches mirror the content rule of VerticalDocumentSequence.
        if ( ( i & -i ) == i ) {
            assertEquals( i, documents1.nextDocument() );
            if ( index.hasCounts ) assertEquals( 1, documents1.count() );
            if ( index.hasPositions ) assertEquals( 1, documents1.positionArray()[ 0 ] );
        }
        else if ( i % 10 == 9 ) {
            assertEquals( i, documents2.nextDocument() );
            if ( index.hasCounts ) assertEquals( 1, documents2.count() );
            if ( index.hasPositions ) assertEquals( 1, documents2.positionArray()[ 0 ] );
        }
    }
    // Release all three iterators (the third one used to be left undisposed).
    documents0.dispose();
    documents1.dispose();
    documents2.dispose();

    // Same basename, this time with many terms and few documents.
    new IndexBuilder( basename, new HorizontalDocumentSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
            .height( height ).documentsPerBatch( 100000000 ).termMapClass( null ).run();
    // The index is opened once without URI options (result unused) and then with mapped=1.
    Index.getInstance( basename + "-text" );
    index = Index.getInstance( basename + "-text?mapped=1" );
    assertEquals( HorizontalDocumentSequence.NUMBER_OF_DOCUMENTS, index.numberOfDocuments );
    assertEquals( HorizontalDocumentSequence.NUMBER_OF_DOCUMENTS * HorizontalDocumentSequence.TERMS_PER_DOCUMENT, index.numberOfTerms );
    for( long i = 0; i < HorizontalDocumentSequence.TARGET_NUMBER_OF_TERMS; i++ ) {
        // Term i was written only in document i / TERMS_PER_DOCUMENT, at position i % TERMS_PER_DOCUMENT.
        IndexIterator documents = index.documents( i );
        assertEquals( 1, documents.frequency() );
        assertEquals( i / HorizontalDocumentSequence.TERMS_PER_DOCUMENT, documents.nextDocument() );
        if ( index.hasCounts ) assertEquals( 1, documents.count() );
        if ( index.hasPositions ) assertEquals( i % HorizontalDocumentSequence.TERMS_PER_DOCUMENT, documents.positionArray()[ 0 ] );
        documents.dispose();
    }
    /* final String basenameZipped = basename + "-zipped";
    if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
    // Vanilla indexing generating a zipped collection (we also use Golomb coding to test the usage of sizes in combinations).
    ZipDocumentCollectionBuilder zipBuilder = new ZipDocumentCollectionBuilder( basenameZipped, getSequence().factory(), true );
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
    .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).builder( zipBuilder ).run();
    // Vanilla indexing using the zipped collection
    new IndexBuilder( basenameZipped, AbstractDocumentSequence.load( basenameZipped + DocumentCollection.DEFAULT_EXTENSION ) ).standardWriterFlags( flags ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 )
    .pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
    // The two indices must be byte-by-byte identical (and we keep the zipped index for future
    // reference)
    sameIndex( basename + "-text", basenameZipped + "-text" );
    sameIndex( basename + "-int", basenameZipped + "-int", "batches" );
    sameIndex( basename + "-date", basenameZipped + "-date", "batches" );
    sameIndex( basename + "-virtual", basenameZipped + "-virtual", "batches" );
    final String basenameSimple = basename + "-simple";
    // Vanilla indexing generating a simple compressed collection
    SimpleCompressedDocumentCollectionBuilder simpleBuilder = new SimpleCompressedDocumentCollectionBuilder( basenameSimple, getSequence().factory(), true );
    new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum )
    .height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).builder( simpleBuilder ).run();
    // Vanilla indexing using the simple compressed collection
    new IndexBuilder( basenameSimple, AbstractDocumentSequence.load( basenameSimple + DocumentCollection.DEFAULT_EXTENSION ) ).standardWriterFlags( flags ).termProcessor( termProcessor ).indexedFields( 0, 1, 2, 3 ).skipBufferSize( 1024 )
    .pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
    // The two indices must be byte-by-byte identical (and we keep the zipped index for future
    // reference)
    sameIndex( basename + "-text", basenameSimple + "-text" );
    sameIndex( basename + "-int", basenameSimple + "-int", "batches" );
    sameIndex( basename + "-date", basenameSimple + "-date", "batches" );
    sameIndex( basename + "-virtual", basenameSimple + "-virtual", "batches" );
    // Indexing with just one batch
    new IndexBuilder( basename + "-onebatch", getSequence() ).standardWriterFlags( flags ).termProcessor( termProcessor ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 )
    .quantum( quantum ).height( height ).virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( NUMBER_OF_DOCUMENTS ).run();
    if ( quantum >= 0 ) {
    // The two indices must be byte-by-byte identical
    sameIndex( basename + "-text", basename + "-onebatch-text", "batches" );
    sameIndex( basename + "-int", basename + "-onebatch-int", "batches" );
    sameIndex( basename + "-date", basename + "-onebatch-date", "batches" );
    sameIndex( basename + "-virtual", basename + "-onebatch-virtual", "batches" );
    }
    else {
    // The two indices must have the same content, as a different division
    // in batches can lead to a different quantum estimate.
    sameContent( basename + "-text", basename + "-onebatch-text" );
    sameContent( basename + "-int", basename + "-onebatch-int" );
    sameContent( basename + "-date", basename + "-onebatch-date" );
    sameContent( basename + "-virtual", basename + "-onebatch-virtual" );
    }*/
}
/**
 * Runs the full index test with the flags returned by {@link IndexTest#defaultStandardIndex()}
 * and the {@link DowncaseTermProcessor} singleton.
 *
 * @param interleaved forwarded to the five-argument overload.
 * @param quantum forwarded to the five-argument overload.
 * @param height forwarded to the five-argument overload.
 */
public void testIndex( boolean interleaved, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
        IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    final Map defaultFlags = IndexTest.defaultStandardIndex();
    final TermProcessor downcase = DowncaseTermProcessor.getInstance();
    testIndex( interleaved, defaultFlags, quantum, height, downcase );
}
/**
 * Runs the full index test with the given writer flags and the {@link DowncaseTermProcessor} singleton.
 *
 * @param interleaved forwarded to the five-argument overload.
 * @param flags the writer flags, forwarded to the five-argument overload.
 * @param quantum forwarded to the five-argument overload.
 * @param height forwarded to the five-argument overload.
 */
public void testIndex( boolean interleaved, Map flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException, InvocationTargetException, NoSuchMethodException {
    final TermProcessor downcase = DowncaseTermProcessor.getInstance();
    testIndex( interleaved, flags, quantum, height, downcase );
}
/**
 * JUnit entry point: runs the index test over a fixed list of configurations.
 *
 * <p>First a copy of the default flags with positions and counts removed is tested with quanta
 * 4 and -4 (interleaved, height 4); then the default flags are tested on a list of
 * (quantum, height) pairs, interleaved first and non-interleaved afterwards.
 */
@Test
public void testIndex() throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException, IllegalAccessException,
        InvocationTargetException, NoSuchMethodException {
    final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( IndexTest.defaultStandardIndex() );

    flags.remove( Component.POSITIONS );
    // The configurations without positions but with counts are currently switched off:
    // testIndex( true, flags, 4, 4 );
    // testIndex( true, flags, -4, 4 );

    flags.remove( Component.COUNTS );
    for ( int quantum : new int[] { 4, -4 } ) testIndex( true, flags, quantum, 4 );

    // Each row is { quantum, height }; rows are run in the order listed.
    final int[][] interleavedCases = { { 0, 0 }, { 1, 1 }, { 4, 4 }, { 8, 1 }, { 8, 4 }, { -4, 1 }, { -4, 4 }, { -16, 1 }, { -16, 10 } };
    for ( int[] row : interleavedCases ) testIndex( true, row[ 0 ], row[ 1 ] );

    final int[][] nonInterleavedCases = { { 1, 0 }, { 1, 1 }, { 4, 4 }, { 8, 1 }, { 8, 4 }, { -4, 1 }, { -4, 4 }, { -16, 1 }, { -16, 10 } };
    for ( int[] row : nonInterleavedCases ) testIndex( false, row[ 0 ], row[ 1 ] );
}
/*public void testPartitionConcatenate( boolean interleaved, Map flags, int quantum, int height ) throws Exception {
// Vanilla indexing
if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
.virtualDocumentResolver( 3, RESOLVER ).run();
// We partition
BinIO.storeObject( DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy" );
new PartitionDocumentally( basename + "-text", basename + "-text-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, flags,
interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
new PartitionDocumentally( basename + "-int", basename + "-int-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX,
interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
new PartitionDocumentally( basename + "-date", basename + "-date-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX,
interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
new PartitionDocumentally( basename + "-virtual", basename + "-virtual-part", DocumentalStrategies.uniform( 3, NUMBER_OF_DOCUMENTS ), basename + "-strategy", 0, 1024, flags,
interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
// For the text part, we need term maps to call sameIndex()
String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
for ( String index : localIndex ) BinIO.storeObject( createMap(index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );
sameContent( basename + "-int", basename + "-int-part" );
sameContent( basename + "-date", basename + "-date-part" );
localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
for ( String index : localIndex )
BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
sameContent( basename + "-virtual", basename + "-virtual-part", new FileLinesCollection( basename + "-virtual" + TERMS_EXTENSION, "UTF-8" ).iterator() );
localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Concatenate( basename + "-text-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
sameContent( basename + "-text", basename + "-text-merged" );
localIndex = new Properties( basename + "-int-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Concatenate( basename + "-int-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
sameContent( basename + "-text", basename + "-text-merged" );
localIndex = new Properties( basename + "-date-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Concatenate( basename + "-date-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
sameContent( basename + "-text", basename + "-text-merged" );
localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Concatenate( basename + "-virtual-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches", flags.containsKey( Component.COUNTS ) ? "" : "occurrences" );
sameContent( basename + "-text", basename + "-text-merged" );
}
public void testPartitionConcatenate() throws Exception {
final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
flags.remove( Component.POSITIONS );
testPartitionConcatenate( true, flags, 4, 4 );
testPartitionConcatenate( true, flags, -4, 4 );
flags.remove( Component.COUNTS );
testPartitionConcatenate( true, flags, 4, 4 );
testPartitionConcatenate( true, flags, -4, 4 );
testPartitionConcatenate( true, defaultStandardIndex(), 0, 0 );
testPartitionConcatenate( true, defaultStandardIndex(), 1, 1 );
testPartitionConcatenate( true, defaultStandardIndex(), 1, 2 );
testPartitionConcatenate( true, defaultStandardIndex(), 4, 1 );
testPartitionConcatenate( true, defaultStandardIndex(), 4, 4 );
testPartitionConcatenate( true, defaultStandardIndex(), 8, 1 );
testPartitionConcatenate( true, defaultStandardIndex(), 8, 4 );
testPartitionConcatenate( true, defaultStandardIndex(), -1, 1 );
testPartitionConcatenate( true, defaultStandardIndex(), -1, 2 );
testPartitionConcatenate( true, defaultStandardIndex(), -4, 1 );
testPartitionConcatenate( true, defaultStandardIndex(), -4, 4 );
testPartitionConcatenate( true, defaultStandardIndex(), -8, 1 );
testPartitionConcatenate( true, defaultStandardIndex(), -8, 4 );
testPartitionConcatenate( false, defaultStandardIndex(), 1, 0 );
testPartitionConcatenate( false, defaultStandardIndex(), 1, 1 );
testPartitionConcatenate( false, defaultStandardIndex(), 1, 2 );
testPartitionConcatenate( false, defaultStandardIndex(), 4, 1 );
testPartitionConcatenate( false, defaultStandardIndex(), 4, 4 );
testPartitionConcatenate( false, defaultStandardIndex(), 8, 1 );
testPartitionConcatenate( false, defaultStandardIndex(), 8, 4 );
testPartitionConcatenate( false, defaultStandardIndex(), -1, 1 );
testPartitionConcatenate( false, defaultStandardIndex(), -1, 2 );
testPartitionConcatenate( false, defaultStandardIndex(), -4, 1 );
testPartitionConcatenate( false, defaultStandardIndex(), -4, 4 );
testPartitionConcatenate( false, defaultStandardIndex(), -8, 1 );
testPartitionConcatenate( false, defaultStandardIndex(), -8, 4 );
}
public void testPartitionMerge( boolean interleaved, Map flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
Exception {
if ( interleaved && flags.get( Component.POSITIONS ) != null ) flags.put( Component.POSITIONS, Coding.GOLOMB );
// Vanilla indexing
new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
.virtualDocumentResolver( 3, RESOLVER ).run();
// Now we use a crazy strategy moving around documents using modular arithmetic
final DocumentalPartitioningStrategy modulo3 = new Modulo3DocumentalClusteringStrategy( NUMBER_OF_DOCUMENTS );
BinIO.storeObject( modulo3, basename + "-strategy" );
new PartitionDocumentally( basename + "-text", basename + "-text-part", modulo3, basename + "-strategy", 0, 1024, flags, interleaved, quantum != 0, Math.abs( quantum ), height,
1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
new PartitionDocumentally( basename + "-int", basename + "-int-part", modulo3, basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, Math.abs( quantum ), height, 1024 * 1024,
DEFAULT_LOG_INTERVAL ).run();
new PartitionDocumentally( basename + "-date", basename + "-date-part", modulo3, basename + "-strategy", 0, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, Math.abs( quantum ), height,
1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
new PartitionDocumentally( basename + "-virtual", basename + "-virtual-part", modulo3, basename + "-strategy", 0, 1024, flags, interleaved, quantum != 0, Math.abs( quantum ), height,
1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
for ( String index : localIndex )
BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );
sameContent( basename + "-int", basename + "-int-part" );
sameContent( basename + "-date", basename + "-date-part" );
localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
for ( String index : localIndex )
BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
sameContent( basename + "-virtual", basename + "-virtual-part", new FileLinesCollection( basename + "-virtual" + TERMS_EXTENSION, "UTF-8" ).iterator() );
localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Merge( basename + "-text-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-text", basename + "-text-merged", "batches" );
else sameContent( basename + "-text", basename + "-text-merged" );
localIndex = new Properties( basename + "-int-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Merge( basename + "-int-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-int", basename + "-int-merged", "batches" );
else sameContent( basename + "-int", basename + "-int-merged" );
localIndex = new Properties( basename + "-date-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Merge( basename + "-date-merged", localIndex, false, 1024, DEFAULT_PAYLOAD_INDEX, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-date", basename + "-date-merged", "batches" );
else sameContent( basename + "-date", basename + "-date-merged" );
localIndex = new Properties( basename + "-virtual-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
new Merge( basename + "-virtual-merged", localIndex, false, 1024, flags, interleaved, quantum != 0, quantum, height, 1024 * 1024, DEFAULT_LOG_INTERVAL ).run();
if ( ! interleaved && quantum >= 0 ) sameIndex( basename + "-virtual", basename + "-virtual-merged", "batches" );
else sameContent( basename + "-virtual", basename + "-virtual-merged" );
}
public void testPartitionMerge() throws Exception {
final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
flags.remove( Component.POSITIONS );
testPartitionMerge( true, flags, 4, 4 );
testPartitionMerge( true, flags, -4, 4 );
flags.remove( Component.COUNTS );
testPartitionMerge( true, flags, 4, 4 );
testPartitionMerge( true, flags, -4, 4 );
testPartitionMerge( true, defaultStandardIndex(), 0, 0 );
testPartitionMerge( true, defaultStandardIndex(), 1, 1 );
testPartitionMerge( true, defaultStandardIndex(), 1, 2 );
testPartitionMerge( true, defaultStandardIndex(), 4, 1 );
testPartitionMerge( true, defaultStandardIndex(), 4, 4 );
testPartitionMerge( true, defaultStandardIndex(), 8, 1 );
testPartitionMerge( true, defaultStandardIndex(), 8, 4 );
testPartitionMerge( true, defaultStandardIndex(), -1, 1 );
testPartitionMerge( true, defaultStandardIndex(), -1, 2 );
testPartitionMerge( true, defaultStandardIndex(), -4, 1 );
testPartitionMerge( true, defaultStandardIndex(), -4, 4 );
testPartitionMerge( true, defaultStandardIndex(), -8, 1 );
testPartitionMerge( true, defaultStandardIndex(), -8, 4 );
testPartitionMerge( false, defaultStandardIndex(), 1, 0 );
testPartitionMerge( false, defaultStandardIndex(), 1, 1 );
testPartitionMerge( false, defaultStandardIndex(), 1, 2 );
testPartitionMerge( false, defaultStandardIndex(), 4, 1 );
testPartitionMerge( false, defaultStandardIndex(), 4, 4 );
testPartitionMerge( false, defaultStandardIndex(), 8, 1 );
testPartitionMerge( false, defaultStandardIndex(), 8, 4 );
testPartitionMerge( false, defaultStandardIndex(), -1, 1 );
testPartitionMerge( false, defaultStandardIndex(), -1, 2 );
testPartitionMerge( false, defaultStandardIndex(), -4, 1 );
testPartitionMerge( false, defaultStandardIndex(), -4, 4 );
testPartitionMerge( false, defaultStandardIndex(), -8, 1 );
testPartitionMerge( false, defaultStandardIndex(), -8, 4 );
}
public void testLexicalPartitioning( boolean interleaved, Map flags ) throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException,
Exception {
// Vanilla indexing
new IndexBuilder( basename, getSequence() ).standardWriterFlags( flags ).interleaved( interleaved ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).virtualDocumentResolver( 3, RESOLVER ).run();
// Now we use a crazy strategy moving around documents using modular arithmetic
final LexicalPartitioningStrategy uniform = LexicalStrategies.uniform( 3, DiskBasedIndex.getInstance( basename + "-text" ) );
BinIO.storeObject( uniform, basename + "-strategy" );
new PartitionLexically( basename + "-text", basename + "-text-part", uniform, basename + "-strategy", 1024, DEFAULT_LOG_INTERVAL ).run();
new PartitionLexically( basename + "-virtual", basename + "-virtual-part", uniform, basename + "-strategy", 1024, DEFAULT_LOG_INTERVAL ).run();
String[] localIndex = new Properties( basename + "-text-part" + PROPERTIES_EXTENSION ).getStringArray( IndexCluster.PropertyKeys.LOCALINDEX );
for ( String index : localIndex )
BinIO.storeObject( createMap( index + TERMS_EXTENSION ), index + TERMMAP_EXTENSION );
sameContent( basename + "-text", basename + "-text-part", new FileLinesCollection( basename + "-text" + TERMS_EXTENSION, "UTF-8" ).iterator() );
sameContent( basename + "-virtual", basename + "-virtual-part" );
}
public void testLexicalPartitioning() throws ConfigurationException, SecurityException, IOException, ClassNotFoundException, InstantiationException, IllegalAccessException, Exception {
testLexicalPartitioning( true, defaultStandardIndex() );
testLexicalPartitioning( false, defaultStandardIndex() );
Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
flags.remove( Component.POSITIONS );
testLexicalPartitioning( true, flags );
flags.remove( Component.COUNTS );
testLexicalPartitioning( true, flags );
}
public void testEmpty( boolean interleaved, Map flags, int quantum, int height ) throws ConfigurationException, SecurityException, IOException, URISyntaxException, ClassNotFoundException, InstantiationException,
IllegalAccessException, InvocationTargetException, NoSuchMethodException {
// Vanilla indexing
new IndexBuilder( basename, getEmptySequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
.virtualDocumentResolver( 3, RESOLVER ).documentsPerBatch( 20 ).run();
checkAgainstContent( getEmptySequence(), null, RESOLVER, Scan.DEFAULT_VIRTUAL_DOCUMENT_GAP, Index.getInstance( basename + "-text" ), Index.getInstance( basename + "-int" ), Index
.getInstance( basename + "-date" ), Index.getInstance( basename + "-virtual" ) );
// Permuted indexing
String mapFile = File.createTempFile( IndexSlowTest.class.getSimpleName(), "permutation" ).toString();
new IndexBuilder( basename + "-mapped", getEmptySequence() ).standardWriterFlags( flags ).skipBufferSize( 1024 ).pasteBufferSize( 1024 ).interleaved( interleaved ).skips( quantum != 0 ).quantum( quantum ).height( height )
.virtualDocumentResolver( 3, RESOLVER ).mapFile( mapFile ).documentsPerBatch( 20 ).run();
sameIndex( basename + "-text", basename + "-mapped-text" );
sameIndex( basename + "-int", basename + "-mapped-int" );
sameIndex( basename + "-date", basename + "-mapped-date" );
sameIndex( basename + "-virtual", basename + "-mapped-virtual" );
}
public void testEmpty() throws Exception {
final Reference2ObjectOpenHashMap flags = new Reference2ObjectOpenHashMap( defaultStandardIndex() );
flags.remove( Component.POSITIONS );
testEmpty( true, flags, 4, 4 );
testEmpty( true, flags, -4, 4 );
flags.remove( Component.COUNTS );
testEmpty( true, flags, 4, 4 );
testEmpty( true, flags, -4, 4 );
testEmpty( true, defaultStandardIndex(), 0, 0 );
testEmpty( true, defaultStandardIndex(), 1, 1 );
testEmpty( true, defaultStandardIndex(), 1, 2 );
testEmpty( true, defaultStandardIndex(), 4, 1 );
testEmpty( true, defaultStandardIndex(), 4, 4 );
testEmpty( true, defaultStandardIndex(), 8, 1 );
testEmpty( true, defaultStandardIndex(), 8, 4 );
testEmpty( true, defaultStandardIndex(), -1, 1 );
testEmpty( true, defaultStandardIndex(), -1, 2 );
testEmpty( true, defaultStandardIndex(), -8, 1 );
testEmpty( true, defaultStandardIndex(), -8, 4 );
testEmpty( true, defaultStandardIndex(), -8, 1 );
testEmpty( true, defaultStandardIndex(), -8, 4 );
testEmpty( false, defaultStandardIndex(), 1, 0 );
testEmpty( false, defaultStandardIndex(), 1, 1 );
testEmpty( false, defaultStandardIndex(), 1, 2 );
testEmpty( false, defaultStandardIndex(), 4, 1 );
testEmpty( false, defaultStandardIndex(), 4, 4 );
testEmpty( false, defaultStandardIndex(), 8, 1 );
testEmpty( false, defaultStandardIndex(), 8, 4 );
testEmpty( false, defaultStandardIndex(), -1, 1 );
testEmpty( false, defaultStandardIndex(), -1, 2 );
testEmpty( false, defaultStandardIndex(), -8, 1 );
testEmpty( false, defaultStandardIndex(), -8, 4 );
testEmpty( false, defaultStandardIndex(), -8, 1 );
testEmpty( false, defaultStandardIndex(), -8, 4 );
}
*/
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy