All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.rdfhdt.hdt.dictionary.impl.section.PFCDictionarySection Maven / Gradle / Ivy

/*
 * File: $HeadURL: https://hdt-java.googlecode.com/svn/trunk/hdt-java/src/org/rdfhdt/hdt/dictionary/impl/section/PFCDictionarySection.java $
 * Revision: $Rev: 201 $
 * Last modified: $Date: 2013-04-17 23:40:20 +0100 (mié, 17 abr 2013) $
 * Last modified by: $Author: mario.arias $
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * Contacting the authors:
 *   Mario Arias:               [email protected]
 *   Javier D. Fernandez:       [email protected]
 *   Miguel A. Martinez-Prieto: [email protected]
 *   Alejandro Andres:          [email protected]
 */

package org.rdfhdt.hdt.dictionary.impl.section;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Iterator;

import org.rdfhdt.hdt.compact.integer.VByte;
import org.rdfhdt.hdt.compact.sequence.SequenceLog64;
import org.rdfhdt.hdt.dictionary.DictionarySectionPrivate;
import org.rdfhdt.hdt.dictionary.TempDictionarySection;
import org.rdfhdt.hdt.exceptions.CRCException;
import org.rdfhdt.hdt.exceptions.IllegalFormatException;
import org.rdfhdt.hdt.listener.ProgressListener;
import org.rdfhdt.hdt.options.HDTOptions;
import org.rdfhdt.hdt.util.BitUtil;
import org.rdfhdt.hdt.util.Mutable;
import org.rdfhdt.hdt.util.crc.CRC32;
import org.rdfhdt.hdt.util.crc.CRC8;
import org.rdfhdt.hdt.util.crc.CRCInputStream;
import org.rdfhdt.hdt.util.crc.CRCOutputStream;
import org.rdfhdt.hdt.util.io.IOUtil;
import org.rdfhdt.hdt.util.string.ByteStringUtil;
import org.rdfhdt.hdt.util.string.CompactString;
import org.rdfhdt.hdt.util.string.ReplazableString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * @author mario.arias
 *
 */
public class PFCDictionarySection implements DictionarySectionPrivate {
	private static final Logger log = LoggerFactory.getLogger(PFCDictionarySection.class);

	public static final int TYPE_INDEX = 2;
	public static final int DEFAULT_BLOCK_SIZE = 16;
	
	// FIXME: Due to java array indexes being int, only 2GB can be addressed per dictionary section.
	protected byte [] text=new byte[0]; // Encoded sequence
	protected int blocksize;
	protected int numstrings;
	protected SequenceLog64 blocks= new SequenceLog64();
	
	public PFCDictionarySection(HDTOptions spec) {
		this.blocksize = (int) spec.getInt("pfc.blocksize");
		if(blocksize==0) {
			blocksize = DEFAULT_BLOCK_SIZE;
		}
	}
	
	/* (non-Javadoc)
	 * @see hdt.dictionary.DictionarySection#load(hdt.dictionary.DictionarySection)
	 */
	@Override
	public void load(TempDictionarySection other, ProgressListener listener) {
		this.blocks = new SequenceLog64(BitUtil.log2(other.size()), other.getNumberOfElements()/blocksize);
		Iterator it = other.getSortedEntries();
		this.load((Iterator)it, other.getNumberOfElements(), listener);
	}
	
	public void load(PFCDictionarySectionBuilder builder) throws IOException {
		builder.finished();
		this.numstrings = builder.getNumstrings();
		this.text = builder.getText();
		this.blocks = builder.getBlocks();
		this.blocksize = builder.getBlocksize();
	}

	public void load(Iterator it, long numentries, ProgressListener listener) {
		this.blocks = new SequenceLog64(32, numentries/blocksize);
		this.numstrings = 0;
		
		
		ByteArrayOutputStream byteOut = new ByteArrayOutputStream(16*1024);
		
		CharSequence previousStr=null;
		
		try {
			while(it.hasNext()) {
				CharSequence str = it.next();

				if(numstrings%blocksize==0) {
					// Add new block pointer
					blocks.append(byteOut.size());

					// Copy full string
					ByteStringUtil.append(byteOut, str, 0);
				} else {
					// Find common part.
					int delta = ByteStringUtil.longestCommonPrefix(previousStr, str);
					// Write Delta in VByte
					VByte.encode(byteOut, delta);
					// Write remaining
					ByteStringUtil.append(byteOut, str, delta);
				}
				byteOut.write(0); // End of string

				numstrings++;
				previousStr = str;
			}
			
			// Ending block pointer.
			blocks.append(byteOut.size());

			// Trim text/blocks
			blocks.aggressiveTrimToSize();

			byteOut.flush();
			text = byteOut.toByteArray();

			// DEBUG
			//dumpAll();
		} catch (IOException e) {
			log.error("Unexpected exception.", e);
		}
	}
		
	protected int locateBlock(CharSequence str) {
		if(blocks.getNumberOfElements()==0) {
			return -1;
		}
		
		int low = 0;
		int high = (int)blocks.getNumberOfElements()-1;
		int max = high;
		
		while (low <= high) {
			int mid = (low + high) >>> 1;
			
			int cmp;
			if(mid==max) {
				cmp = -1;
			} else {
				int pos = (int)blocks.get(mid);
				cmp = ByteStringUtil.strcmp(str, text, pos);
//				System.out.println("Comparing against block: "+ mid + " which is "+ ByteStringUtil.asString(text, pos)+ " Result: "+cmp);
			}
			
			if (cmp<0) {
				high = mid - 1;
			} else if (cmp > 0) {
				low = mid + 1;
			} else {
				return mid; // key found
			}
		}
		return -(low + 1);  // key not found.
	}
	
	
	/* (non-Javadoc)
	 * @see hdt.dictionary.DictionarySection#locate(java.lang.CharSequence)
	 */
	@Override
	public long locate(CharSequence str) {
		if(text==null || blocks==null) {
			return 0;
		}
		
		int blocknum = locateBlock(str);
		if(blocknum>=0) {
			// Located exactly
			return (blocknum*blocksize)+1;
		} else {
			// Not located exactly.
			blocknum = -blocknum-2;
			
			if(blocknum>=0) {
				int idblock = locateInBlock(blocknum, str);

				if(idblock != 0) {
					return (blocknum*blocksize)+idblock+1;
				}
			}
		}
		
		return 0;
	}
	
	public int locateInBlock(int block, CharSequence str) {
		if(block>=blocks.getNumberOfElements()) {
			return 0;
		}
		
		int pos = (int)blocks.get(block);
		ReplazableString tempString = new ReplazableString();
		
		Mutable delta = new Mutable<>(0L);
		int idInBlock = 0;
		int cshared=0;
		
//		dumpBlock(block);
		
		// Read the first string in the block
		int slen = ByteStringUtil.strlen(text, pos);
		tempString.append(text, pos, slen);
		pos+=slen+1;
		idInBlock++;
		
		while( (idInBlock=cshared)
			{
				// Current delta value means that this string
				// has a larger long common prefix than the previous one
				cshared += ByteStringUtil.longestCommonPrefix(tempString, str, cshared);
				
				if((cshared==str.length()) && (tempString.length()==str.length())) {
					break;
				}
			} else {
				// We have less common characters than before, 
				// this string is bigger that what we are looking for.
				// i.e. Not found.
				idInBlock = 0;
				break;
			}
			pos+=slen+1;
			idInBlock++;
			
		}

		if(pos>=text.length || idInBlock== blocksize) {
			idInBlock=0;
		}
		
		return idInBlock;
	}
	
	/* (non-Javadoc)
	 * @see hdt.dictionary.DictionarySection#extract(int)
	 */
	@Override
	public CharSequence extract(long id) {
		if(text==null || blocks==null) {
			return null;
		}
		
		if(id<1 || id>numstrings) {
			return null;
		}
		
		int block = (int) ((id-1)/blocksize);
		int stringid = (int) ((id-1)%blocksize);
		int pos = (int) blocks.get(block);
 		int len = ByteStringUtil.strlen(text, pos);
		
		Mutable delta = new Mutable<>(0L);
		ReplazableString tempString = new ReplazableString();
		tempString.append(text, pos, len);
		
		for(int i=0;i=blocks.getNumberOfElements()) {
//			return;
//		}
//		
//		System.out.println("Dump block "+block);
//		ReplazableString tempString = new ReplazableString();
//		Mutable delta = new Mutable(0);
//		int idInBlock = 0;
//			
//		int pos = (int)blocks.get(block);
//		
//		// Copy first string
//		int len = ByteStringUtil.strlen(text, pos);
//		tempString.append(text, pos, len);
//		pos+=len+1;
//		
//		System.out.println((block*blocksize+idInBlock)+ " ("+idInBlock+") => "+ tempString);
//		idInBlock++;
//		
//		while( (idInBlock "+ tempString + " Delta="+delta.getValue()+ " Len="+len);
//			
//			pos+=len+1;
//			idInBlock++;
//		}
//	}

	/* (non-Javadoc)
	 * @see hdt.dictionary.DictionarySection#size()
	 */
	@Override
	public long size() {
		return text.length+blocks.size();
	}

	/* (non-Javadoc)
	 * @see hdt.dictionary.DictionarySection#getNumberOfElements()
	 */
	@Override
	public long getNumberOfElements() {
		return numstrings;
	}

	/* (non-Javadoc)
	 * @see hdt.dictionary.DictionarySection#getEntries()
	 */
	@Override
	public Iterator getSortedEntries() {
		return new Iterator() {
			int id;
			int pos;
			final Mutable delta = new Mutable<>(0L);
			final ReplazableString tempString = new ReplazableString();

			@Override
			public boolean hasNext() {
				return idInteger.MAX_VALUE) {
			input.reset();
			throw new IllegalArgumentException("This class cannot process files with a packed buffer bigger than 2GB"); 
		}
		
		// Read blocks
		blocks = new SequenceLog64();
		blocks.load(input, listener);	// Read blocks from input, they have their own CRC check.
		
		// Read packed data
		in.setCRC(new CRC32());
		text = IOUtil.readBuffer(in, (int) bytes, listener);
		if(!in.readCRCAndCheck()) {
			throw new CRCException("CRC Error while reading Dictionary Section Plain Front Coding Data.");
		}
	}

	@Override
	public void close() throws IOException {
		text=null;
		blocks.close();
		blocks=null;
	}
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy