![JAR search and dependency download from the Maven repository](/logo.png)
test.it.unimi.dsi.big.mg4j.document.DocumentCollectionTest Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of mg4j-big Show documentation
MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.
The newest version!
package it.unimi.dsi.big.mg4j.document;
/*
* MG4J: Managing Gigabytes for Java (big)
*
* Copyright (C) 2005-2011 Paolo Boldi
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by the Free
* Software Foundation; either version 3 of the License, or (at your option)
* any later version.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program; if not, see <http://www.gnu.org/licenses/>.
*
*/
import static org.junit.Assert.*;

import it.unimi.dsi.fastutil.Arrays;
import it.unimi.dsi.fastutil.Swapper;
import it.unimi.dsi.fastutil.ints.AbstractIntComparator;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.LongIterators;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongSet;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.util.Properties;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.util.StringTokenizer;

import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.io.FileUtils;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.Test;
public class DocumentCollectionTest {

	/* We consider documents abstractly described by two fields each.
	 *
	 * WARNING: the first string MUST be a prefix of the second string. */
	private final static String[][] document = new String[][] {
		// 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
		new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
		new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
		new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
		// This tests that zipped collections handle properly initial spaces and
		// that word readers are propagated correctly.
		new String[] { " aaa uuu aaa" , " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
	};

	/** The documents above repeated twice: the expected content of the
	 * concatenation of two copies of the collection (see {@link #testConcatenated()}). */
	private final static String[][] document2 = new String[][] {
		// 0 1 2 3 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
		new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
		new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
		new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
		// This tests that zipped collections handle properly initial spaces and
		// that word readers are propagated correctly.
		new String[] { " aaa uuu aaa" , " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
		new String[] { "xxx yyy zzz xxx", "xxx yyy zzz xxx aaa xxx aaa yyy aaa yyy aaa zzz aaa www aaa" },
		new String[] { "aaa xxx aaa aaa", "aaa xxx aaa aaa xxx aaa zzz uuu" },
		new String[] { "aaa uuu aaa" , "aaa uuu aaa xxx xxx xxx aaa xxx" },
		// This tests that zipped collections handle properly initial spaces and
		// that word readers are propagated correctly.
		new String[] { " aaa uuu aaa" , " aaa uuu aaa _ __ xxx _ xxx xxx aaa xxx" },
	};

	/** Factory metadata shared by all collections built in this test: ASCII encoding and a
	 * FastBufferedReader word reader constructed with "_" — presumably making '_' a word
	 * character, which the fourth fixture document exercises (TODO confirm against
	 * FastBufferedReader's string constructor). */
	private final static Properties DEFAULT_PROPERTIES = new Properties();
	static {
		DEFAULT_PROPERTIES.setProperty( PropertyBasedDocumentFactory.MetadataKeys.ENCODING, "ASCII" );
		DEFAULT_PROPERTIES.setProperty( PropertyBasedDocumentFactory.MetadataKeys.WORDREADER, it.unimi.dsi.io.FastBufferedReader.class.getName() + "(_)" );
	}

	/** The number of documents. */
	private final static int ndoc = document.length;
	/** The temporary directory where all tests are run. */
	private static File tempDir;
	/** The set of files in the HTML directory. */
	private static String[] htmlFileSet;

	/** Given a two-field document, produce an HTML document with the first field as title and
	 * the second field as body.
	 *
	 * @param document the document.
	 * @return the HTML version of the document.
	 */
	private static String getHTMLDocument( String[] document ) {
		MutableString res = new MutableString();
		// NOTE(review): the markup below was reconstructed — the original tags were stripped by
		// an HTML-unaware extraction. The factory must see the first field as <title> and the
		// second as body for the title/text field checks to be meaningful.
		res.append( "<html>\n" );
		res.append( "<head>\n<title>" + document[ 0 ] + "</title>\n</head>\n" );
		// Do NOT append the first part of the body: the first field is a prefix of the second,
		// so the body is the second field minus that prefix.
		res.append( "<body>\n" + document[ 1 ].substring( document[ 0 ].length() ) );
		res.append( "\n</body>\n" );
		res.append( "</html>" );
		return res.toString();
	}

	/** Given a two-field document, produce a mbox document with the first field as subject and
	 * the second field as body.
	 *
	 * <p>NOTE(review): several header lines below appear to have lost their angle-bracketed
	 * e-mail addresses to an HTML-unaware extraction ("From:", "Return-Path:", "for ;",
	 * "Message-id:"); they are left untouched because only the Subject and the body are
	 * checked by the tests.
	 *
	 * @param document the document.
	 * @return the mbox version of the document.
	 */
	private static String getMboxDocument( String[] document ) {
		MutableString res = new MutableString();
		res.append( "From MAILER-DAEMON Fri Apr 15 16:22:32 2005\n" );
		res.append( "Date: 15 Apr 2005 16:22:32 +0200\n" );
		res.append( "From: Mail System Internal Data \n" );
		res.append( "Subject: " + document[ 0 ] + "\n" );
		res.append( "Message-ID: <[email protected]>\n" );
		res.append( "X-IMAP: 1102967122 0000138458\n" );
		res.append( "Return-Path: \n" );
		res.append( "Received: from localhost (localhost.localdomain [127.0.0.1])\n" );
		res.append( "\tby sliver.usr.dsi.unimi.it (8.12.11/8.12.11) with ESMTP id iAUNtadn007305\n");
		res.append( "\tfor ; Wed, 1 Dec 2004 00:55:36 +0100\n" );
		res.append( "Received: from law5.usr.dsi.unimi.it [159.149.146.241]\n" );
		res.append( "\tby localhost with IMAP (fetchmail-6.2.5)\n" );
		res.append( "\tfor vigna@localhost (single-drop); Wed, 01 Dec 2004 00:55:36 +0100 (CET)\n" );
		res.append( "To: [email protected]\n" );
		res.append( "Message-id: \n" );
		res.append( "Content-type: TEXT/PLAIN; charset=iso-8859-15\n" );
		res.append( "X-Warning: UNAuthenticated Sender\n" );
		res.append( "Content-Transfer-Encoding: 8bit\n" );
		res.append( "Content-Length: " + document[ 1 ].length() + "\n" );
		res.append( "\n" );
		res.append( document[ 1 ] + "\n" );
		return res.toString();
	}

	/** Checks that the tokenizer and the word reader return exactly the same sequence of words.
	 *
	 * @param wordReader the word reader.
	 * @param tok the tokenizer.
	 * @throws IOException
	 */
	private void checkSameWords( WordReader wordReader, StringTokenizer tok ) throws IOException {
		MutableString word = new MutableString();
		MutableString nonWord = new MutableString();
		boolean aWordInDocum, aWordInDocument;
		boolean firstTime = true;
		for (;;) {
			aWordInDocum = wordReader.next( word, nonWord );
			if ( firstTime ) {
				firstTime = false;
				// A leading separator (e.g. the initial space in the fourth fixture document)
				// makes the word reader emit one empty word; skip it, as StringTokenizer won't.
				if ( word.equals( "" ) ) continue;
			}
			// After the first word, the reader must never produce an empty word.
			assertFalse( aWordInDocum && word.equals( "" ) );
			aWordInDocument = tok.hasMoreElements();
			// Both sources must run out of words at exactly the same time.
			assertTrue( aWordInDocum == aWordInDocument );
			if ( !aWordInDocum ) break;
			assertEquals( tok.nextElement(), word.toString() );
		}
	}

	/** Checks that the documents in the collection have the same sequence of words as in
	 * {@code document}: the names of the fields to be checked are specified in the array.
	 *
	 * @param coll the collection.
	 * @param fieldName the field names.
	 * @param document documents to be checked against.
	 * @throws IOException
	 */
	private void checkAllDocuments( final DocumentCollection coll, final String[] fieldName, final String[][] document ) throws IOException {
		final int nfields = fieldName.length;
		final int[] fieldNumber = new int[ nfields ];
		final int[] arrayIndex = new int[ nfields ];
		// Look for field indices
		for ( int i = 0; i < nfields; i++ ) {
			arrayIndex[ i ] = i;
			int j;
			for ( j = 0; j < coll.factory().numberOfFields(); j++ )
				if ( coll.factory().fieldName( j ).equals( fieldName[ i ] ) ) {
					fieldNumber[ i ] = j;
					break;
				}
			assert j < coll.factory().numberOfFields();
		}
		// Sort the three parallel arrays by factory field index, so that fields are read
		// in the order the factory numbers them.
		Arrays.quickSort( 0, nfields, new AbstractIntComparator() {
			public int compare( int x, int y ) {
				return fieldNumber[ x ] - fieldNumber[ y ];
			}}, new Swapper() {
			public void swap( int x, int y ) {
				int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t;
				t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t;
				String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q;
			}} );
		// Start checking
		for ( int doc = 0; doc < coll.size(); doc++ ) {
			Document docum = coll.document( doc );
			for ( int i = 0; i < nfields; i++ ) {
				int field = fieldNumber[ i ];
				Reader content = (Reader)docum.content( field );
				WordReader wordReader = docum.wordReader( field );
				wordReader.setReader( content );
				StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );
				System.err.println( "Checking document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );
				checkSameWords( wordReader, tok );
			}
			docum.close();
		}
	}

	/** Checks that the documents in the sequence have the same sequence of words as in
	 * {@code document}: the names of the fields to be checked are specified in the array.
	 *
	 * @param seq the sequence.
	 * @param fieldName the field names.
	 * @param document documents to be checked against.
	 * @throws IOException
	 */
	private void checkAllDocumentsSeq( final DocumentSequence seq, final String[] fieldName, final String[][] document ) throws IOException {
		final int nfields = fieldName.length;
		final int[] fieldNumber = new int[ nfields ];
		final int[] arrayIndex = new int[ nfields ];
		// Look for field indices
		for ( int i = 0; i < nfields; i++ ) {
			arrayIndex[ i ] = i;
			int j;
			for ( j = 0; j < seq.factory().numberOfFields(); j++ )
				if ( seq.factory().fieldName( j ).equals( fieldName[ i ] ) ) {
					fieldNumber[ i ] = j;
					break;
				}
			assert j < seq.factory().numberOfFields();
		}
		// Sort fields to guarantee that they are correctly numbered
		Arrays.quickSort( 0, nfields, new AbstractIntComparator() {
			public int compare( int x, int y ) {
				return fieldNumber[ x ] - fieldNumber[ y ];
			}}, new Swapper() {
			public void swap( int x, int y ) {
				int t = fieldNumber[ x ]; fieldNumber[ x ] = fieldNumber[ y ]; fieldNumber[ y ] = t;
				t = arrayIndex[ x ]; arrayIndex[ x ] = arrayIndex[ y ]; arrayIndex[ y ] = t;
				String q = fieldName[ x ]; fieldName[ x ] = fieldName[ y ]; fieldName[ y ] = q;
			}} );
		// Start checking, this time by exhausting the sequence's iterator.
		DocumentIterator iterator = seq.iterator();
		Document docum;
		int doc = 0;
		while ( ( docum = iterator.nextDocument() ) != null ) {
			for ( int i = 0; i < nfields; i++ ) {
				int field = fieldNumber[ i ];
				Reader content = (Reader)docum.content( field );
				WordReader wordReader = docum.wordReader( field );
				wordReader.setReader( content );
				StringTokenizer tok = new StringTokenizer( document[ doc ][ arrayIndex[ i ] ] );
				System.err.println( "Checking sequentially document " + doc + " field " + fieldName[ i ] + " (" + field + ")" );
				checkSameWords( wordReader, tok );
			}
			docum.close();
			doc++;
		}
		iterator.close();
	}

	/** Builds, under a fresh temporary directory, every artifact the tests read: the HTML file
	 * set, the mbox file, the (exact and approximated) zipped collections and the (exact and
	 * approximated) simple compressed collections. */
	@BeforeClass
	public static void setUp() throws IOException, ConfigurationException {
		// Create a new directory under /tmp
		tempDir = File.createTempFile( "mg4jtest", null );
		tempDir.delete();
		tempDir.mkdir();
		// Now create the hierarchy for HTML files
		File htmlDir = new File( tempDir, "html" );
		htmlDir.mkdir();
		System.err.println( "Temporary directory: " + tempDir );
		htmlFileSet = new String[ ndoc ];
		for ( int i = 0; i < ndoc; i++ ) {
			String docFile = new File( htmlDir, "doc" + i + ".html" ).toString();
			htmlFileSet[ i ] = docFile;
			Writer docWriter = new OutputStreamWriter( new FileOutputStream( docFile ), "ISO-8859-1" );
			docWriter.write( getHTMLDocument( document[ i ] ) );
			docWriter.close();
		}
		// Now create the mbox file
		Writer mboxWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "mbox" ) ), "ISO-8859-1" );
		for ( int i = 0; i < ndoc; i++ )
			mboxWriter.write( getMboxDocument( document[ i ] ) );
		mboxWriter.close();
		// Now create the zip collections
		FileSetDocumentCollection fileSetDocumentCollection = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );
		ZipDocumentCollectionBuilder zipCollBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "zip" ).toString(),
				fileSetDocumentCollection.factory(), true );
		zipCollBuilder.build( fileSetDocumentCollection );
		ZipDocumentCollectionBuilder apprZipCollBuilder = new ZipDocumentCollectionBuilder( new File( tempDir, "azip" ).toString(),
				fileSetDocumentCollection.factory(), false );
		apprZipCollBuilder.build( fileSetDocumentCollection );
		// Now create the simple collections
		SimpleCompressedDocumentCollectionBuilder simpleCollBuilder = new SimpleCompressedDocumentCollectionBuilder( new File( tempDir, "simple" ).toString(),
				fileSetDocumentCollection.factory(), true );
		simpleCollBuilder.build( fileSetDocumentCollection );
		SimpleCompressedDocumentCollectionBuilder apprSimpleCollBuilder = new SimpleCompressedDocumentCollectionBuilder( new File( tempDir, "asimple" ).toString(),
				fileSetDocumentCollection.factory(), false );
		apprSimpleCollBuilder.build( fileSetDocumentCollection );
		// Close ONCE, after all builders are done: the original closed the collection before
		// building the simple collections and then closed it a second time.
		fileSetDocumentCollection.close();
	}

	/** Deletes the temporary directory.
	 *
	 * <p>The original declared this as {@code protected void tearDown()} with no annotation:
	 * JUnit 4 never invoked it, so the temporary directory leaked after every run. */
	@AfterClass
	public static void tearDown() throws IOException {
		FileUtils.forceDelete( tempDir );
	}

	@Test
	public void testFileSetDocumentCollection() throws IOException, ConfigurationException {
		System.err.println( "Checking fileset collection" );
		FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );
		// Expected value first (the original had the arguments swapped).
		assertEquals( ndoc, coll.size() );
		checkAllDocuments( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testFileSetDocumentCollectionSeq() throws IOException, ConfigurationException {
		System.err.println( "Checking fileset collection sequentially" );
		FileSetDocumentCollection coll = new FileSetDocumentCollection( htmlFileSet, new HtmlDocumentFactory( DEFAULT_PROPERTIES ) );
		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testZipDocumentCollection() throws IOException, ClassNotFoundException {
		System.err.println( "Checking zipped collection" );
		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() );
		checkAllDocuments( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testZipDocumentCollectionSeq() throws IOException, ClassNotFoundException {
		System.err.println( "Checking zipped collection sequentially" );
		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "zip.collection" ).toString() );
		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testZipDocumentCollectionAppr() throws IOException, ClassNotFoundException {
		System.err.println( "Checking approximated zipped collection" );
		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() );
		checkAllDocuments( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testZipDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {
		System.err.println( "Checking approximated zipped collection sequentially" );
		ZipDocumentCollection coll = (ZipDocumentCollection)BinIO.loadObject( new File( tempDir, "azip.collection" ).toString() );
		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testSimpleCompressedDocumentCollection() throws IOException, ClassNotFoundException {
		System.err.println( "Checking simple compressed collection" );
		SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "simple.collection" ).toString() );
		checkAllDocuments( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testSimpleCompressedDocumentCollectionSeq() throws IOException, ClassNotFoundException {
		System.err.println( "Checking simple compressed collection sequentially" );
		SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "simple.collection" ).toString() );
		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testSimpleCompressedDocumentCollectionAppr() throws IOException, ClassNotFoundException {
		System.err.println( "Checking approximated simple compressed collection" );
		SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		checkAllDocuments( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testSimpleCompressedDocumentCollectionApprSeq() throws IOException, ClassNotFoundException {
		System.err.println( "Checking approximated simple compressed collection sequentially" );
		SimpleCompressedDocumentCollection coll = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		checkAllDocumentsSeq( coll, new String[] { "title", "text" }, document );
		coll.close();
	}

	@Test
	public void testConcatenated() throws IOException, ClassNotFoundException {
		SimpleCompressedDocumentCollection coll0 = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		SimpleCompressedDocumentCollection coll1 = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		ConcatenatedDocumentCollection concatenatedDocumentCollection = new ConcatenatedDocumentCollection( new String[] { new File( tempDir, "asimple.collection" ).toString(), new File( tempDir, "asimple.collection" ).toString() } );
		ConcatenatedDocumentSequence concatenatedDocumentSequence0 = new ConcatenatedDocumentSequence( coll0, coll1 );
		ConcatenatedDocumentSequence concatenatedDocumentSequence1 = new ConcatenatedDocumentSequence( new File( tempDir, "asimple.collection" ).toString(), new File( tempDir, "asimple.collection" ).toString() );
		checkAllDocumentsSeq( concatenatedDocumentSequence0, new String[] { "title", "text" }, document2 );
		checkAllDocumentsSeq( concatenatedDocumentSequence1, new String[] { "title", "text" }, document2 );
		checkAllDocuments( concatenatedDocumentCollection, new String[] { "title", "text" }, document2 );
		concatenatedDocumentCollection.close();
		concatenatedDocumentSequence0.close();
		// The original closed sequence 0 twice and leaked sequence 1.
		concatenatedDocumentSequence1.close();
	}

	@Test
	public void testInputStreamSequence() throws IOException, ConfigurationException {
		System.err.println( "Checking input stream (text field only)" );
		// Extract only field number 1, and write it out with separator '\u0000'
		MutableString res = new MutableString();
		String[][] justSecondField = new String[ ndoc ][ 1 ];
		for ( int i = 0; i < ndoc; i++ ) {
			res.append( document[ i ][ 1 ] + "\u0000" );
			justSecondField[ i ][ 0 ] = document[ i ][ 1 ];
		}
		String resString = res.toString();
		// Write the sequence on a file (in UTF-8)
		Writer resWriter = new OutputStreamWriter( new FileOutputStream( new File( tempDir, "stream" ) ), "UTF-8" );
		resWriter.write( resString );
		resWriter.close();
		// Read it as a input stream document sequence
		InputStream is = new FileInputStream( new File( tempDir, "stream" ) );
		DocumentSequence seq = new InputStreamDocumentSequence( is, '\u0000', new IdentityDocumentFactory( DEFAULT_PROPERTIES ) );
		checkAllDocumentsSeq( seq, new String[] { "text" }, justSecondField );
		seq.close();
	}

	@Test
	public void testSubsetDocumentSequence() throws IOException, ClassNotFoundException {
		// All documents
		DocumentSequence seq = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		LongSet allDocuments = new LongOpenHashSet( LongIterators.fromTo( 0, document.length ) );
		SubsetDocumentSequence trivialSubsetDocumentSequence = new SubsetDocumentSequence( seq, allDocuments );
		checkAllDocumentsSeq( trivialSubsetDocumentSequence, new String[] { "title", "text" }, document );
		seq.close();
		trivialSubsetDocumentSequence.close();
		// Even documents only
		seq = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		String[][] evenDocuments = new String[ document.length / 2 ][];
		LongSet evenDocumentPointers = new LongOpenHashSet();
		for ( int i = 0; i < evenDocuments.length; i++ ) {
			evenDocuments[ i ] = document[ 2 * i ];
			evenDocumentPointers.add( 2 * i );
		}
		SubsetDocumentSequence evenSubsetDocumentSequence = new SubsetDocumentSequence( seq, evenDocumentPointers );
		checkAllDocumentsSeq( evenSubsetDocumentSequence, new String[] { "title", "text" }, evenDocuments );
		seq.close();
		evenSubsetDocumentSequence.close();
		// All but number 3
		seq = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		String[][] almostAll = new String[ document.length - 1 ][];
		LongSet almostAllDocumentPointers = new LongOpenHashSet();
		for ( int i = 0, j = 0; i < document.length; i++ )
			if ( i != 3 ) {
				almostAll[ j ] = document[ i ];
				// Pointers must index the UNDERLYING sequence, so add i, not j (the original
				// added j, which was only accidentally correct because the excluded document
				// is the last one, making i == j for every retained document).
				almostAllDocumentPointers.add( i );
				j++;
			}
		SubsetDocumentSequence almostAllDocumentSequence = new SubsetDocumentSequence( seq, almostAllDocumentPointers );
		checkAllDocumentsSeq( almostAllDocumentSequence, new String[] { "title", "text" }, almostAll );
		seq.close();
		almostAllDocumentSequence.close();
		// None
		seq = (SimpleCompressedDocumentCollection)BinIO.loadObject( new File( tempDir, "asimple.collection" ).toString() );
		String[][] none = new String[ 0 ][];
		LongSet noneDocumentPointers = new LongOpenHashSet();
		SubsetDocumentSequence noneDocumentSequence = new SubsetDocumentSequence( seq, noneDocumentPointers );
		checkAllDocumentsSeq( noneDocumentSequence, new String[] { "title", "text" }, none );
		seq.close();
		noneDocumentSequence.close();
	}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy