All Downloads are FREE. Search and download functionality uses the official Maven repository.

org.fbk.cit.hlt.thewikimachine.index.CrossLanguageIndexParser Maven / Gradle / Ivy

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.fbk.cit.hlt.thewikimachine.index;

import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.xerial.snappy.SnappyInputStream;

import java.io.*;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.concurrent.*;

/**
 * Base class for jobs that walk every document of a cross-language index and hand
 * each one to {@link #processDocument(Document, int)} through a thread pool.
 *
 * <p>Expected call sequence: optionally {@link #setNumThreads(int)}, then
 * {@link #init(String)}, {@link #go()} and finally {@link #end()}. The executor
 * created by {@code init} is shut down by {@code go}, so {@code go} can do useful
 * work only once per {@code init}.
 *
 * <p>{@code processDocument} may be invoked concurrently: tasks run on the pool
 * threads and, because the pool uses the caller-runs rejection policy, also on the
 * thread that called {@code go} whenever the bounded work queue is full.
 * Implementations must tolerate that.
 *
 * <p>Created by aprosio on 1/20/13.
 */
public abstract class CrossLanguageIndexParser {
	static Logger logger = Logger.getLogger(CrossLanguageIndexParser.class.getName());

	private int numThreads;
	private ExecutorService myExecutor;
	protected BigCrossLanguageSearcher clSchema = null;

	public final static int DEFAULT_THREADS_NUMBER = 1;
	public final static int DEFAULT_QUEUE_SIZE = 10000;

	// DecimalFormat is not thread-safe: always go through formatNumber(long).
	private static final DecimalFormat df = new DecimalFormat("###,###,###,###");

	/** Value of {@code maxDoc()} read from the index reader at the start of {@link #go()}. */
	protected int totalNumber = 0;

	/**
	 * Handles a single document of the index.
	 *
	 * @param d the document read from the index
	 * @param i the document number that was used to read {@code d}
	 * @throws IOException if the implementation fails on I/O; the failure is logged
	 *                     by the worker and does not stop the remaining documents
	 */
	public abstract void processDocument(Document d, int i) throws IOException;

	/** Creates a parser configured with {@link #DEFAULT_THREADS_NUMBER} worker threads. */
	public CrossLanguageIndexParser() {
		this.numThreads = DEFAULT_THREADS_NUMBER;
	}

	/**
	 * @return the number of worker threads used when the executor is created
	 */
	public int getNumThreads() {
		return numThreads;
	}

	/**
	 * Sets the number of worker threads. Only takes effect if called before
	 * {@link #init(String)}, which is where the executor is built.
	 *
	 * @param numThreads the pool size to use
	 */
	public void setNumThreads(int numThreads) {
		this.numThreads = numThreads;
	}

	/**
	 * Creates the worker pool and opens the index stored in {@code indexFolder}.
	 *
	 * @param indexFolder path of the index folder passed to {@code BigCrossLanguageSearcher}
	 * @throws IOException if the searcher cannot be opened
	 */
	public void init(String indexFolder) throws IOException {
		logger.info(String.format("Creating the thread executor (%d)", numThreads));
		int blockQueueSize = DEFAULT_QUEUE_SIZE;
		BlockingQueue<Runnable> blockingQueue = new ArrayBlockingQueue<Runnable>(blockQueueSize);
		// When the bounded queue is full the submitting thread runs the task itself,
		// which throttles the producer loop in go() instead of rejecting work.
		RejectedExecutionHandler rejectedExecutionHandler = new ThreadPoolExecutor.CallerRunsPolicy();
		myExecutor = new ThreadPoolExecutor(numThreads, numThreads, 1, TimeUnit.MINUTES, blockingQueue, rejectedExecutionHandler);

		logger.info(String.format("Reding folder %s", indexFolder));
		clSchema = new BigCrossLanguageSearcher(indexFolder);
	}

	/**
	 * Closes the index searcher. Does nothing if {@link #init(String)} never
	 * managed to open it.
	 *
	 * @throws IOException if closing the searcher fails
	 */
	public void end() throws IOException {
		if (clSchema != null) {
			clSchema.close();
		}
	}

	/** Task that forwards one document to {@link #processDocument(Document, int)}. */
	public class LineProcessor implements Runnable {
		private final Document d;
		private final int i;

		/**
		 * @param d the document to process
		 * @param i the document number of {@code d}
		 */
		public LineProcessor(Document d, int i) {
			this.d = d;
			this.i = i;
		}

		/**
		 * Processes the document; any exception is logged (with its stack trace)
		 * and swallowed so that one bad document does not abort the whole run.
		 */
		@Override
		public void run() {
			try {
				processDocument(d, i);
			} catch (Exception e) {
				// Pass the throwable itself so the stack trace lands in the log
				// rather than on stderr.
				logger.error("Error processing document " + i + ": " + e.getMessage(), e);
			}
		}
	}

	/**
	 * Submits every document of the index to the worker pool, waits for all
	 * submitted tasks to finish, and logs the elapsed wall-clock time in milliseconds.
	 * A progress dot is printed to standard output every 1,000 documents and a
	 * counter every 100,000.
	 *
	 * <p>The executor is always shut down before this method returns, including
	 * when reading a document fails.
	 *
	 * @throws IOException           if a document cannot be read from the index
	 * @throws IllegalStateException if {@link #init(String)} has not been called
	 */
	public void go() throws IOException {
		if (clSchema == null || myExecutor == null) {
			throw new IllegalStateException("init(String) must be called before go()");
		}

		long begin = System.currentTimeMillis();

		try {
			// Read the bound once instead of on every iteration.
			final int maxDoc = clSchema.getIndexReader().maxDoc();
			totalNumber = maxDoc;

			for (int i = 0; i < maxDoc; i++) {
				Document d = clSchema.getIndexReader().document(i);
				if (d == null) {
					continue;
				}

				myExecutor.execute(new LineProcessor(d, i));

				if ((i + 1) % 1000 == 0) {
					System.out.print(".");
				}
				if ((i + 1) % 100000 == 0) {
					System.out.println(" " + (i + 1) + "/" + maxDoc);
				}
			}

			System.out.println();
		} finally {
			// Always release the pool threads, even if reading the index failed;
			// already queued tasks are still allowed to complete.
			myExecutor.shutdown();
		}

		try {
			logger.debug("Wating to end");
			myExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
		} catch (InterruptedException e) {
			logger.error("Interrupted while waiting for the workers to finish", e);
			// Restore the flag so callers can observe the interruption.
			Thread.currentThread().interrupt();
		}

		// Measure only after the workers are done so the figure covers the processing itself.
		long end = System.currentTimeMillis();
		logger.info(String.format("Elapsed time: %s", formatNumber(end - begin)));

		logger.info("Ending process");
	}

	/**
	 * Logs a progress line: the count, the elapsed time and the current date,
	 * separated by tabs.
	 *
	 * @param tot   the count to report
	 * @param begin start timestamp, in the same unit as {@code end}
	 * @param end   end timestamp; {@code end - begin} is what gets logged
	 */
	public void notification(int tot, long begin, long end) {
		logger.info(formatNumber(tot) + "\t" + formatNumber(end - begin) + "\t" + new Date());
	}

	/** Formats {@code value} with thousands separators, serializing access to the shared formatter. */
	private static String formatNumber(long value) {
		synchronized (df) {
			return df.format(value);
		}
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy