All Downloads are FREE. Search and download functionalities are using the official Maven repository.

src.it.unimi.dsi.big.mg4j.document.ZipDocumentCollection Maven / Gradle / Ivy

Go to download

MG4J (Managing Gigabytes for Java) is a free full-text search engine for large document collections written in Java. The big version is a fork of the original MG4J that can handle more than 2^31 terms and documents.

The newest version!
package it.unimi.dsi.big.mg4j.document;

/*		 
 * MG4J: Managing Gigabytes for Java (big)
 *
 * Copyright (C) 2005-2011 Paolo Boldi  
 *
 *  This library is free software; you can redistribute it and/or modify it
 *  under the terms of the GNU Lesser General Public License as published by the Free
 *  Software Foundation; either version 3 of the License, or (at your option)
 *  any later version.
 *
 *  This library is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
 *  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License
 *  for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 */

import it.unimi.dsi.Util;
import it.unimi.dsi.fastutil.objects.ObjectArrayList;
import it.unimi.dsi.fastutil.objects.Reference2ObjectArrayMap;
import it.unimi.dsi.fastutil.objects.Reference2ObjectMap;
import it.unimi.dsi.io.FastBufferedReader;
import it.unimi.dsi.io.WordReader;
import it.unimi.dsi.lang.MutableString;
import it.unimi.dsi.big.mg4j.document.PropertyBasedDocumentFactory.MetadataKeys;
import it.unimi.dsi.big.mg4j.tool.Scan.VirtualDocumentFragment;
import it.unimi.dsi.big.mg4j.util.parser.callback.AnchorExtractor;

import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.Reader;
import java.io.Serializable;
import java.util.NoSuchElementException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
import java.util.zip.ZipInputStream;

import org.apache.log4j.Logger;

/** A {@linkplain it.unimi.dsi.big.mg4j.document.DocumentCollection document collection} stored in a {@linkplain ZipFile zip file}.
 * 
 * 

Each instance of this class has an associated zip file. Each Zip entry corresponds to a document: * the title is recorded in the comment field, whereas the * URI is written with {@link MutableString#writeSelfDelimUTF8(java.io.OutputStream)} * directly to the zipped output stream. When building an exact * {@linkplain it.unimi.dsi.big.mg4j.document.ZipDocumentCollection} * subsequent word/nonword pairs are written in the same way, and * delimited by two empty strings. If the collection is not exact, just words are written, * and delimited by an empty string. Non-text fields are written directly to the zipped output stream * as serialised objects. * *

The collection will produce the same documents as the original sequence whence it * was produced, in the following sense: * *

    *
  • the resulting collection has as many document as the original sequence, in the same order, with * the same titles and URI; *
  • every document has the same number of fields, with the same names and types; *
  • non-textual non-virtual fields will be written out as objects, so they need to be serializable; *
  • virtual fields will be written as a sequence of {@linkplain MutableString#writeSelfDelimUTF8(java.io.DataOutput) self-delimiting UTF-8 mutable strings} * starting with the number of fragments (converted into a string with {@link String#valueOf(int)}), * followed by a pair of strings for each fragment (the first string being the document specifier, * and the second being the associated text); *
  • textual fields will be written out in such a way that, when reading them, the same sequence * of words and non-words will be produced; alternatively, one may produce a collection that only * copies words (non-words are not copied). *
* *

The collection will be, as any other collection, serialized on a file, but it will refer to another * zip file that is going to contain the documents themselves. Please use {@link AbstractDocumentSequence#load(CharSequence)} * to load instances of this collection. * *

Note that the zip format is not designed for a large number of files. This class is mainly a useful example, * and a handy way to build quickly a collection containing all fields at indexing time. For a more efficient * kind of collection, see {@link SimpleCompressedDocumentCollection}. * *

Warning: the {@link java.io.Reader} returned by {@link it.unimi.dsi.big.mg4j.document.Document#content(int)} * for documents produced by this factory is just obtained as the concatenation of words and non-words returned by * the word reader for that field. In case the collection is not exact, nonwords are substituted by a space. */ public class ZipDocumentCollection extends AbstractDocumentCollection implements Serializable { private static final long serialVersionUID = 2L; public final static String ZIP_EXTENSION = ".zip"; /** Symbolic names for common properties of a {@link it.unimi.dsi.big.mg4j.document.DocumentCollection}. */ public static enum PropertyKeys { /** The serialised collection. */ COLLECTION, } private static final Logger LOGGER = Util.getLogger( ZipDocumentCollection.class ); private static final boolean DEBUG = false; /** The name of the zip collection file. */ private String zipFilename; /** The zip collection file. */ private transient ZipFile zipFile; /** The factory used for the original document sequence. */ private final DocumentFactory underlyingFactory; /** The factory used for this document collection. */ private final DocumentFactory factory; /** The number of documents. */ private final long numberOfDocuments; /** true iff this is an exact reproduction of the original sequence (i.e., if also non-words are preserved). */ private final boolean exact; /** A factory tightly coupled to a {@link ZipDocumentCollection}. 
*/ protected static class ZipFactory extends AbstractDocumentFactory { private static final long serialVersionUID = 1L; private final boolean exact; private final DocumentFactory underlyingFactory; protected ZipFactory( final DocumentFactory underlyingFactory, final boolean exact ) { this.underlyingFactory = underlyingFactory; this.exact = exact; } public ZipFactory copy() { return this; } public int numberOfFields() { return underlyingFactory.numberOfFields(); } public String fieldName( final int field ) { ensureFieldIndex( field ); return underlyingFactory.fieldName( field ); } public int fieldIndex( final String fieldName ) { return underlyingFactory.fieldIndex( fieldName ); } public FieldType fieldType( final int field ) { ensureFieldIndex( field ); return underlyingFactory.fieldType( field ); } public Document getDocument( final InputStream rawContent, final Reference2ObjectMap,Object> metadata ) throws IOException { return new AbstractDocument() { final DataInputStream rawContentDataInputStream = new DataInputStream( rawContent ); int nextFieldToRead = 0; final MutableString uri = new MutableString(); { uri.readSelfDelimUTF8( rawContent ).compact(); } @Override public void close() throws IOException { super.close(); rawContent.close(); } public CharSequence title() { return (CharSequence)metadata.get( MetadataKeys.TITLE ); } public String toString() { return title().toString(); } public CharSequence uri() { return uri.length() == 0 ? null : uri; } /** Skips until the end of the current field, and increments nextFieldToRead. 
* @throws ClassNotFoundException * @throws IOException */ private void skipOneField() throws IOException, ClassNotFoundException { switch( fieldType( nextFieldToRead ) ) { case TEXT: MutableString word = new MutableString(); MutableString nonWord = new MutableString(); do { word.readSelfDelimUTF8( rawContent ); if ( exact ) nonWord.readSelfDelimUTF8( rawContent ); } while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) ); break; case VIRTUAL: final int nfrag = rawContentDataInputStream.readInt(); for ( int i = 0; i < 2 * nfrag; i++ ) MutableString.skipSelfDelimUTF8( rawContent ); break; default: // Non-text and non-virtual new ObjectInputStream( rawContent ).readObject(); } nextFieldToRead++; } /** Skips to the given field. * * @param field the field to skip to. * @throws IOException * @throws ClassNotFoundException */ private void skipToField( final int field ) throws IOException, ClassNotFoundException { if ( nextFieldToRead > field ) throw new IllegalStateException( "Trying to skip to field " + field + " after " + nextFieldToRead ); while ( nextFieldToRead < field ) skipOneField(); } public Object content( final int field ) { ensureFieldIndex( field ); Object result = null; if ( DEBUG ) LOGGER.debug( "Called content(" + field + "); nextField:" + nextFieldToRead ); try { skipToField( field ); if ( fieldType( nextFieldToRead ) == FieldType.VIRTUAL ) { final int nfrag = rawContentDataInputStream.readInt(); MutableString doc = new MutableString(); MutableString text = new MutableString(); VirtualDocumentFragment[] fragArray = new VirtualDocumentFragment[ nfrag ]; for ( int i = 0; i < nfrag; i++ ) { doc.readSelfDelimUTF8( rawContent ); text.readSelfDelimUTF8( rawContent ); fragArray[ i ] = new AnchorExtractor.Anchor( doc.copy(), text.copy() ); } result = new ObjectArrayList( fragArray ); } else if ( fieldType( nextFieldToRead ) != FieldType.TEXT ) { result = new ObjectInputStream( rawContent ).readObject(); if ( DEBUG ) LOGGER.debug( "Read " + result + " 
from field " + fieldName( nextFieldToRead ) + " of object " + title() ); nextFieldToRead++; } else { if ( DEBUG ) LOGGER.debug( "Returning reader for " + field ); result = new Reader() { FastBufferedReader fbr = null; int f = field; public void close() {} public int read( final char[] cbuf, final int off, final int len ) throws IOException { if ( fbr == null ) { if ( DEBUG ) LOGGER.debug( "Initialising reader for content " + f ); MutableString text = new MutableString(); MutableString word = new MutableString(); MutableString nonWord = new MutableString(); do { text.append( word.readSelfDelimUTF8( rawContent ) ); if ( exact ) text.append( nonWord.readSelfDelimUTF8( rawContent ) ); else text.append( ' ' ); } while ( word.length() > 0 || ( exact && nonWord.length() > 0 ) ); fbr = new FastBufferedReader( text ); nextFieldToRead++; } return fbr.read( cbuf, off, len ); } }; } } catch ( IOException e ) { throw new RuntimeException( e ); } catch (ClassNotFoundException e) { throw new RuntimeException( e ); } return result; } public WordReader wordReader( final int field ) { ensureFieldIndex( field ); if ( DEBUG ) LOGGER.debug( "Called wordReader(" + field + ")" ); try { skipToField( field ); } catch ( Exception e ) { throw new RuntimeException( e ); } //logger.debug( "Asked for a new word reader for field " + fieldName( field ) ); switch ( fieldType( field ) ) { case TEXT: return new WordReader() { private static final long serialVersionUID = 1L; public boolean next( final MutableString word, final MutableString nonWord ) throws IOException { try { word.readSelfDelimUTF8( rawContent ); } catch( EOFException e ) { return false; // TODO: a bit raw } nonWord.length( 0 ); if ( exact ) { try { nonWord.readSelfDelimUTF8( rawContent ); } catch( EOFException e ) { return true; // TODO: a bit raw } } else nonWord.append( ' ' ); final boolean goOn = word.length() != 0 || ( exact && nonWord.length() != 0 ); if ( DEBUG ) LOGGER.debug( "Got word <" + word + "|" + nonWord + "> exact=" 
+ exact + " returning " + goOn ); if ( ! goOn ) nextFieldToRead++; return goOn; } public WordReader setReader( final Reader reader ) { return this; } public WordReader copy() { throw new UnsupportedOperationException(); } }; case VIRTUAL: return new FastBufferedReader(); default: return null; } } }; } } private void initZipFile() { try { zipFile = new ZipFile( zipFilename ); } // We leave the possibility for a filename() to fix the problem and load the right zipfile. catch( Exception e ) {} } private void ensureZipFile() { if ( zipFile == null ) throw new IllegalStateException( "The .zip file used by this " + ZipDocumentCollection.class.getSimpleName() + " has not been loaded correctly; please use " + AbstractDocumentSequence.class.getName() + ".load() or call filename() after deserialising this instance" ); } /** Constructs a document collection (for reading) corresponding to a given zip collection file. * * @param zipFilename the filename of the zip collection. * @param underlyingFactory the underlying document factory. * @param numberOfDocuments2 the number of documents. * @param exact true iff this is an exact reproduction of the original sequence. */ public ZipDocumentCollection( final String zipFilename, final DocumentFactory underlyingFactory, final long numberOfDocuments2, final boolean exact ) { this.zipFilename = zipFilename; this.underlyingFactory = underlyingFactory; this.numberOfDocuments = numberOfDocuments2; this.exact = exact; // Creates the factory factory = new ZipFactory( underlyingFactory, exact ); initZipFile(); } @Override public void filename( CharSequence filename ) throws IOException { /* If we don't have a zipFile, we try to get it relatively to the basename. * We also store the resulting filename, so copy() should work. 
*/ if ( zipFile == null ) { zipFilename = new File( new File( filename.toString() ).getParentFile(), zipFilename ).toString(); zipFile = new ZipFile( zipFilename ); } } private void readObject( final ObjectInputStream s ) throws IOException, ClassNotFoundException { s.defaultReadObject(); initZipFile(); } public ZipDocumentCollection copy() { return new ZipDocumentCollection( zipFilename, underlyingFactory, numberOfDocuments, exact ); } public DocumentFactory factory() { return factory; } public long size() { return numberOfDocuments; } private ZipEntry getEntry( final int index ) { ensureDocumentIndex( index ); ensureZipFile(); final ZipEntry entry = zipFile.getEntry( Integer.toString( index ) ); if ( entry == null ) throw new NoSuchElementException( "Failure retrieving entry " + index ); return entry; } public Document document( final long index ) throws IOException { ensureDocumentIndex( index ); final ZipEntry entry = getEntry( (int)index ); final Reference2ObjectMap,Object> metadata = metadata( index, entry ); InputStream is = zipFile.getInputStream( entry ); return factory.getDocument( is, metadata ); } private Reference2ObjectMap,Object> metadata( final long index, ZipEntry entry ) { ensureDocumentIndex( index ); if ( entry == null ) entry = getEntry( (int)index ); final Reference2ObjectArrayMap,Object> metadata = new Reference2ObjectArrayMap,Object>( 1 ); metadata.put( MetadataKeys.TITLE, entry.getComment() ); return metadata; } public Reference2ObjectMap,Object> metadata( final long index ) { return metadata( index, null ); } public InputStream stream( final long index ) throws IOException { ensureDocumentIndex( index ); final ZipEntry entry = getEntry ( (int)index ); entry.getComment(); // Just skip title InputStream is = zipFile.getInputStream( entry ); return is; } public DocumentIterator iterator() { try { return new AbstractDocumentIterator() { final Reference2ObjectArrayMap,Object> metadata = new Reference2ObjectArrayMap,Object>( new Enum[ 1 ], new 
Object[ 1 ] ); ZipInputStream zis = new ZipInputStream( new FileInputStream( zipFile.getName() ) ); public Document nextDocument() throws IOException { ZipEntry entry; String name; do { entry = zis.getNextEntry(); if ( entry == null ) return null; name = entry.getName(); } while ( !Character.isDigit( name.charAt( 0 ) ) ); String title = entry.getComment(); if ( DEBUG ) LOGGER.debug( "Reading sequentially document " + title + ", name: " + entry.getName() ); InputStream is = zipFile.getInputStream( entry ); metadata.put( MetadataKeys.TITLE, title ); return factory.getDocument( is, metadata ); } }; } catch ( FileNotFoundException e ) { throw new RuntimeException( e ); } } public void close() throws IOException { super.close(); if ( zipFile != null ) zipFile.close(); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy