All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.dkpro.similarity.uima.vsm.esaindexer.EsaIndexer Maven / Gradle / Ivy

The newest version!
/*******************************************************************************
 * Copyright 2013 Mateusz Parzonka
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *******************************************************************************/
package org.dkpro.similarity.uima.vsm.esaindexer;

import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;
import static org.apache.uima.fit.factory.CollectionReaderFactory.createReader;

import java.io.File;
import java.io.IOException;

import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.pipeline.SimplePipeline;

import de.tudarmstadt.ukp.dkpro.core.io.jwpl.WikipediaReaderBase;
import de.tudarmstadt.ukp.dkpro.core.snowball.SnowballStemmer;
import de.tudarmstadt.ukp.dkpro.core.tokit.BreakIteratorSegmenter;
import de.tudarmstadt.ukp.wikipedia.api.WikiConstants.Language;

/**
 * Indexes Wikipedia and creates an inverted index to be used with ESA.
 * <p>
 * The work is done in two steps: first a Lucene index is built from the
 * Wikipedia articles ({@link #LUCENE_INDEX_PATH}), then that index is inverted
 * into the ESA index ({@link #ESA_INDEX_PATH}).
 * <p>
 * The database connection settings in this class are placeholders
 * ({@code YOUR_HOST}, {@code YOUR_DB}, ...) and have to be replaced before the
 * indexer can be run.
 *
 * @author Mateusz Parzonka
 *
 */
public class EsaIndexer {

	/** Directory that receives the intermediate Lucene index. */
	private static final String LUCENE_INDEX_PATH = "target/lucene";

	/** Directory that receives the final inverted (ESA) index. */
	private static final String ESA_INDEX_PATH = "target/esa";

	/**
	 * Wikipedia edition that is read. Must denote the same language as
	 * {@link #LANGUAGE_CODE}, otherwise articles are segmented and stemmed
	 * with components configured for a different language.
	 */
	private static final Language WIKIPEDIA_LANGUAGE = Language.german;

	/** Language code handed to the segmenter and the stemmer. */
	private static final String LANGUAGE_CODE = "de";

	/**
	 * Runs both indexing steps in sequence: Lucene index first, inverted index second.
	 *
	 * @param args ignored
	 * @throws Exception if either indexing step fails
	 */
	public static void main(String[] args)
		throws Exception
	{
		createLuceneWikipediaIndex(WIKIPEDIA_LANGUAGE, LANGUAGE_CODE);
		createInvertedIndex();
	}

	/**
	 * Creates a Lucene index from Wikipedia. Articles are read from the Wikipedia
	 * database, segmented, stemmed (with lower casing enabled) and then written to
	 * {@link #LUCENE_INDEX_PATH} by the {@code LuceneIndexer}, which is configured
	 * with a minimum of 50 terms per document.
	 * <p>
	 * NOTE(review): earlier documentation stated that only lower cased stems with
	 * length &gt;= 3 containing only characters are indexed. That filtering is not
	 * visible in this class and presumably happens inside {@code LuceneIndexer} -
	 * confirm there.
	 *
	 * @param wikipediaLanguage the Wikipedia edition to read; previously this was
	 *            hard-coded to German regardless of the requested language
	 * @param languageCode the language code for the segmenter and stemmer; must
	 *            denote the same language as {@code wikipediaLanguage}
	 * @throws UIMAException if a component cannot be created or the pipeline fails
	 * @throws IOException if reading the articles or writing the index fails
	 */
	private static void createLuceneWikipediaIndex(Language wikipediaLanguage, String languageCode)
		throws UIMAException, IOException
	{
		// Placeholders below must be replaced with real connection settings.
		CollectionReader reader = createReader(
				ExtendedWikipediaArticleReader.class,
				WikipediaReaderBase.PARAM_HOST, "YOUR_HOST",
				WikipediaReaderBase.PARAM_DB, "YOUR_DB",
				WikipediaReaderBase.PARAM_USER, "YOUR_USERNAME",
				WikipediaReaderBase.PARAM_PASSWORD, "YOUR_PASSWORD",
				WikipediaReaderBase.PARAM_LANGUAGE, wikipediaLanguage);

		AnalysisEngine segmenter = createEngine(
				BreakIteratorSegmenter.class,
				BreakIteratorSegmenter.PARAM_LANGUAGE, languageCode);

		AnalysisEngine stemmer = createEngine(
				SnowballStemmer.class,
				SnowballStemmer.PARAM_LANGUAGE, languageCode,
				SnowballStemmer.PARAM_LOWER_CASE, true);

		AnalysisEngine indexTermGenerator = createEngine(
				LuceneIndexer.class,
				LuceneIndexer.PARAM_INDEX_PATH, LUCENE_INDEX_PATH,
				LuceneIndexer.PARAM_MIN_TERMS_PER_DOCUMENT, 50);

		SimplePipeline.runPipeline(reader, segmenter, stemmer, indexTermGenerator);
	}

	/**
	 * Creates an inverted index for ESA from the Lucene index in
	 * {@link #LUCENE_INDEX_PATH} and writes it to {@link #ESA_INDEX_PATH}.
	 * The minimum document frequency of the inverter is set to 1.
	 *
	 * @throws Exception if the inversion fails
	 */
	private static void createInvertedIndex()
		throws Exception
	{
		IndexInverter indexInverter = new IndexInverter(new File(LUCENE_INDEX_PATH), new File(ESA_INDEX_PATH));
		indexInverter.setMinDocumentFrequency(1);
		indexInverter.createInvertedIndex();
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy