![JAR search and dependency download from the Maven repository](/logo.png)
org.fbk.cit.hlt.thewikimachine.index.CrossLanguageIndexParser Maven / Gradle / Ivy
/*
* Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.fbk.cit.hlt.thewikimachine.index;
import org.apache.log4j.Logger;
import org.apache.lucene.document.Document;
import org.fbk.cit.hlt.thewikimachine.ExtractorParameters;
import org.fbk.cit.hlt.thewikimachine.analysis.HardTokenizer;
import org.fbk.cit.hlt.thewikimachine.analysis.Tokenizer;
import org.xerial.snappy.SnappyInputStream;
import java.io.*;
import java.text.DecimalFormat;
import java.util.Date;
import java.util.concurrent.*;
/**
 * Base class for jobs that visit every document of a cross-language index and
 * hand each one to {@link #processDocument(Document, int)} through a thread pool.
 *
 * <p>Expected call order: {@link #setNumThreads(int)} (optional),
 * {@link #init(String)}, {@link #go()}, {@link #end()}. The executor created by
 * {@code init} is shut down by {@code go}, so {@code init} has to be called
 * again before a second run.
 *
 * <p>{@code processDocument} is called from the pool threads and, because the
 * executor uses a caller-runs rejection policy, also from the thread running
 * {@code go} whenever the work queue is full. Implementations therefore have to
 * tolerate concurrent calls even when a single worker thread is configured.
 *
 * <p>Created 1/20/13.
 *
 * @author aprosio
 */
public abstract class CrossLanguageIndexParser {

    static Logger logger = Logger.getLogger(CrossLanguageIndexParser.class.getName());

    /** Number of worker threads used when {@link #setNumThreads(int)} is never called. */
    public final static int DEFAULT_THREADS_NUMBER = 1;

    /** Capacity of the bounded queue that feeds the worker threads. */
    public final static int DEFAULT_QUEUE_SIZE = 10000;

    private static DecimalFormat df = new DecimalFormat("###,###,###,###");

    private int numThreads;
    private ExecutorService myExecutor;

    /** Searcher over the index being parsed; {@code null} until {@link #init(String)} has run. */
    protected BigCrossLanguageSearcher clSchema = null;

    /** Value of {@code maxDoc()} read at the start of the last {@link #go()} call. */
    protected int totalNumber = 0;

    /**
     * Handles one document of the index.
     *
     * @param d the document read from the index
     * @param i the document number that was used to read {@code d}
     * @throws IOException if the implementation fails while handling the document
     */
    public abstract void processDocument(Document d, int i) throws IOException;

    /** Creates a parser configured with {@link #DEFAULT_THREADS_NUMBER} worker threads. */
    public CrossLanguageIndexParser() {
        this.numThreads = DEFAULT_THREADS_NUMBER;
    }

    /**
     * @return the number of worker threads the next {@link #init(String)} call will create
     */
    public int getNumThreads() {
        return numThreads;
    }

    /**
     * Sets the pool size. Only takes effect on the next {@link #init(String)} call,
     * because that is where the executor is built.
     *
     * @param numThreads the number of worker threads
     */
    public void setNumThreads(int numThreads) {
        this.numThreads = numThreads;
    }

    /**
     * Builds the thread pool and opens the index.
     *
     * @param indexFolder the folder passed to the {@code BigCrossLanguageSearcher} constructor
     * @throws IOException if opening the index fails
     */
    public void init(String indexFolder) throws IOException {
        logger.info(String.format("Creating the thread executor (%d)", numThreads));
        BlockingQueue<Runnable> blockingQueue = new ArrayBlockingQueue<Runnable>(DEFAULT_QUEUE_SIZE);
        // Caller-runs: when the queue is full the submitting thread processes the
        // document itself, which throttles the reader instead of dropping work.
        RejectedExecutionHandler rejectedExecutionHandler = new ThreadPoolExecutor.CallerRunsPolicy();
        myExecutor = new ThreadPoolExecutor(numThreads, numThreads, 1, TimeUnit.MINUTES,
                blockingQueue, rejectedExecutionHandler);
        logger.info(String.format("Reding folder %s", indexFolder));
        clSchema = new BigCrossLanguageSearcher(indexFolder);
    }

    /**
     * Closes the index searcher. Does nothing when {@link #init(String)} was never called.
     *
     * @throws IOException if closing the searcher fails
     */
    public void end() throws IOException {
        if (clSchema != null) {
            clSchema.close();
        }
    }

    /**
     * Task that forwards one document to {@link #processDocument(Document, int)}.
     * Failures are logged and swallowed so that one bad document does not stop the run.
     */
    public class LineProcessor implements Runnable {

        private final Document d;
        private final int i;

        /**
         * @param d the document to process
         * @param i the document number of {@code d}
         */
        public LineProcessor(Document d, int i) {
            this.d = d;
            this.i = i;
        }

        public void run() {
            try {
                processDocument(d, i);
            } catch (Exception e) {
                // Log through the logger with the throwable attached so the stack
                // trace and the failing document number end up in the same place.
                logger.error("Error processing document " + i + ": " + e.getMessage(), e);
            }
        }
    }

    /**
     * Submits every document of the index to the thread pool, waits until all of
     * them have been processed and logs the elapsed wall-clock time in milliseconds.
     *
     * <p>The executor is always shut down before this method returns or throws, so
     * its worker threads do not outlive a failed run. If the calling thread is
     * interrupted while waiting, the interrupt flag is restored and the method
     * returns without waiting further.
     *
     * @throws IOException if reading a document from the index fails
     * @throws IllegalStateException if {@link #init(String)} has not been called
     */
    public void go() throws IOException {
        if (clSchema == null || myExecutor == null) {
            throw new IllegalStateException("init(String) must be called before go()");
        }

        long begin = System.currentTimeMillis();
        // Read once: the value is both the loop bound and the progress total.
        final int maxDoc = clSchema.getIndexReader().maxDoc();
        totalNumber = maxDoc;

        try {
            for (int i = 0; i < maxDoc; i++) {
                Document d = clSchema.getIndexReader().document(i);
                if (d == null) {
                    continue;
                }
                myExecutor.execute(new LineProcessor(d, i));

                // Progress on stdout: a dot every 1,000 documents, a counter every 100,000.
                if ((i + 1) % 1000 == 0) {
                    System.out.print(".");
                }
                if ((i + 1) % 100000 == 0) {
                    System.out.println(" " + (i + 1) + "/" + maxDoc);
                }
            }
            System.out.println();
        } finally {
            // Stop accepting work even when reading the index failed; otherwise the
            // pool's non-daemon threads would keep the JVM alive.
            myExecutor.shutdown();
        }

        try {
            logger.debug("Wating to end");
            myExecutor.awaitTermination(Long.MAX_VALUE, TimeUnit.NANOSECONDS);
        } catch (InterruptedException e) {
            logger.error("Interrupted while waiting for the workers to finish", e);
            // Preserve the interrupt for whoever called us.
            Thread.currentThread().interrupt();
        }

        // Measured after the pool has drained so queued documents are included.
        long end = System.currentTimeMillis();
        logger.info(String.format("Elapsed time: %s", df.format(end - begin)));
        logger.info("Ending process");
    }

    /**
     * Logs a tab-separated progress line: the count, the elapsed time and the current date.
     *
     * @param tot   the count to report
     * @param begin start timestamp
     * @param end   end timestamp, in the same unit as {@code begin}
     */
    public void notification(int tot, long begin, long end) {
        logger.info(df.format(tot) + "\t" + df.format(end - begin) + "\t" + new Date());
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy