All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.fbk.twm.utils.ExtractorParameters Maven / Gradle / Ivy

The newest version!
/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.twm.utils;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.File;
import java.util.HashMap;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Computes the names of all files, indexes and logs produced while processing
 * a Wikipedia XML dump.
 * <p>
 * The language code and the dump version are read from the dump file name,
 * which is expected to look like {@code [lang]wiki-[version]-pages-articles.xml}.
 * Every derived name is then built as
 * {@code <outputDir>/<lang>wiki-<version>-<suffix>} by {@link #setNames(String)}.
 * <p>
 * Fields are package-private and mutable; a few of them also have public
 * setters. The class performs no synchronization.
 */
public class ExtractorParameters {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named ExtractorParameters.
	 */
	static Logger logger = Logger.getLogger(ExtractorParameters.class.getName());

	/** Extension appended to the section-related output files. */
	public static final String OUTPUT_EXT = ".csv";

	/**
	 * Matches {@code [lang]wiki-[version]-pages-articles.xml}; group 1 is the
	 * language code, group 2 the numeric version. Compiled once and reused.
	 */
	private static final Pattern XML_FILE_NAME_PATTERN = Pattern.compile("(\\w+)wiki-(\\d+)-pages-articles\\.xml");

	// ---------------------------------------------------------------------
	// Values parsed from the dump file name / constructor arguments
	// ---------------------------------------------------------------------
	String version;
	String lang;
	String wikipediaXmlFileName;
	String extractionOutputDirName;
	String wikipediaDirName;
	String wikipediaOutputFilePrefixName;

	// ---------------------------------------------------------------------
	// Derived file and index names (all assigned in setNames)
	// ---------------------------------------------------------------------
	String wikipediaRedirFileName;
	String wikipediaRedirIndexName;
	String wikipediaDisambiguationFileName;
	String wikipediaDisambiguationIndexName;
	String wikipediaAnalysisFileName;
	String wikipediaTitleIdFileName;
	String wikipediaSeeAlsoFileName;
	String wikipediaSeeAlsoIndexName;
	String commonsFileName;
	String wikipediaFileName;
	String wikipediaFileSourceName;
	String wikipediaPageFreqIndexName;
	String wikipediaPageTrafficFileName;
	String wikipediaPersonInfoFileName;
	String wikipediaPersonInfoIndexName;
	String wikipediaOutgoingFileName;
	String wikipediaPageCategoryPrefix;
	String wikipediaPageTopCategoryIndexName;
	String wikipediaPageCategoryFileName;
	String wikipediaPageCategoryXmlIndex;
	String wikipediaPageCategoryMainFileName;
	String wikipediaPageCategoryMainSortedCategoryFileName;
	String wikipediaPageCategoryIndexName;
	String wikipediaPageTopCategoryFileName;
	String wikipediaCategorySuperCategoryIndexName;
	String wikipediaCategorySubCategoryIndexName;
	String wikipediaCategoryPageIndexName;
	String wikipediaFormIdFileName;

	/** Category-related file names keyed by a short label (e.g. {@code "index-p2c"}). */
	HashMap<String, String> wikipediaCategoryFileNames = new HashMap<String, String>();

	String wikipediaCategorySuperCategoryFileName;
	String wikipediaCategorySubCategoryFileName;
	String wikipediaCategoryPageFileName;
	String wikipediaCrossLanguageLinkFileName;
	String wikipediaTemplateFilePrefixName;

	/** Template-related file names keyed by a short label (e.g. {@code "freq"}). */
	HashMap<String, String> wikipediaTemplateFileNames = new HashMap<String, String>();

	String wikipediaInfoboxFileName;
	String wikipediaFirstNameFileName;
	String wikipediaExampleFileName;

	// WikipediaSectionExtractor filenames
	String wikipediaSectionFileName;
	String wikipediaPageSectionFreqFileName;
	String wikipediaSectionFreqFileName;
	String wikipediaSectionTitlePrefix;
	String wikipediaSectionTitleFileName;

	/** Section-title-related file names keyed by a short label (e.g. {@code "index-p2s"}). */
	HashMap<String, String> wikipediaSectionTitleFileNames = new HashMap<String, String>();

	String wikipediaSortedPageFileName;
	String wikipediaSortedFormFileName;
	String wikipediaIncomingFileName;
	String wikipediaFormFreqFileName;
	String wikipediaPageFreqFileName;
	String wikipediaTextFileName;
	String wikipediaNGramFileName;
	String wikipediaVectorFileName;
	String wikipediaDBPediaFileName;
	String wikipediaFilteredExampleFileName;
	String wikipediaFirstNameIndexName;
	String wikipediaPageFormIndexName;
	String wikipediaFormPageIndexName;
	String wikipediaNGramIndexName;
	String wikipediaTypeIndexName;
	String wikipediaCrossLanguageLinkIndexName;
	String extractionLogFileName;
	String preprocessingLogFileName;
	String sortingLogFileName;
	String indexingLogFileName;
	String wikipediaVectorIndexName;
	String vectorLogFileName;
	String outgoingLogFileName;
	String wikipediaContentPageFileName;
	String wikipediaOutgoingIndexName;
	String wikipediaIncomingIndexName;
	String wikipediaIncomingOutgoingFileName;
	String wikipediaIncomingOutgoingWeightedIndexName;
	String wikipediaIncomingOutgoingIndexName;
	String incomingOutgoingLogFileName;
	String wikipediaSortedIncomingOutgoingFileName;
	String wikipediaUnigramFileName;
	String oneExamplePerSenseFileName;
	String oneExamplePerSenseIndexName;
	String wikipediaTextIndexName;
	String wikipediaFileSourceIndexName;
	String wikipediaPageAirPediaClassIndexName;
	String wikipediaAbstractIndexName;
	String wikipediaAbstractFileName;
	String wikipediaFirstSentenceFileName;
	String wikipediaFirstSentenceIndexName;
	String wikipediaPageAllCategoryFileName;
	String wikipediaPagePerCategoryCountFileName;
	String wikipediaNomFileName;
	String wikipediaCategoryFileName;
	String wikipediaPageTypeFileName;
	String wikipediaPageTypeIndexName;
	String wikipediaPageNavigationTemplateIndexName;
	String wikipediaPageNavigationTemplateFileName;
	String wikipediaPagePortalIndexName;
	String wikipediaPagePortalFileName;
	String wikipediaPageTopicsIndexName;
	String wikipediaPageTopicsFileName;
	String wikipediaDBPediaClassesIndexName;
	String wikipediaNamNomIndexName;
	String wikipediaAirpedia2IndexName;
	String wikipediCategoryIndexFileName;
	String wikipediaSortedPagePerCategoryCountFileName;
	String wikipediaPageAllCategoryIndexName;

	/** Locale built from {@link #lang}; stays {@code null} when the dump file name cannot be parsed. */
	Locale locale;

	// ---------------------------------------------------------------------
	// Construction
	// ---------------------------------------------------------------------

	/**
	 * Creates the parameters for the given dump file.
	 *
	 * @param wikipediaXmlFileName path of the Wikipedia XML dump; its file name should match
	 *                             {@code [lang]wiki-[version]-pages-articles.xml}
	 * @param baseDir              output directory, or the root of the output tree
	 *                             (see {@code useAsBaseDir}); must not be {@code null}
	 * @param useAsBaseDir         if {@code true} the files are placed under
	 *                             {@code baseDir/lang/version/}; otherwise directly under {@code baseDir}
	 */
	public ExtractorParameters(String wikipediaXmlFileName, String baseDir, boolean useAsBaseDir) {
		parseXmlFileName(wikipediaXmlFileName);

		String normalizedBaseDir = baseDir;
		if (!normalizedBaseDir.endsWith(File.separator)) {
			normalizedBaseDir += File.separator;
		}

		if (useAsBaseDir) {
			setNames(normalizedBaseDir + lang + File.separator + version);
		}
		else {
			setNames(normalizedBaseDir);
		}
	}

	/**
	 * Creates the parameters for the given dump file, writing directly into
	 * {@code extractionOutputDirName}.
	 *
	 * @param wikipediaXmlFileName    path of the Wikipedia XML dump
	 * @param extractionOutputDirName directory that receives the output files
	 */
	public ExtractorParameters(String wikipediaXmlFileName, String extractionOutputDirName) {
		this(wikipediaXmlFileName, extractionOutputDirName, false);
	}

	/**
	 * Stores the dump file name and extracts the parent directory, the language
	 * code, the locale and the version from it. Does nothing if a dump file name
	 * has already been stored.
	 * <p>
	 * If the file name does not contain {@code [lang]wiki-[version]-pages-articles.xml},
	 * {@code lang}, {@code locale} and {@code version} are left untouched and a
	 * warning is logged.
	 *
	 * @param wikipediaXmlFileName path of the Wikipedia XML dump
	 */
	protected void parseXmlFileName(String wikipediaXmlFileName) {
		if (this.wikipediaXmlFileName != null) {
			return;
		}

		this.wikipediaXmlFileName = wikipediaXmlFileName;
		logger.debug(wikipediaXmlFileName);
		File wikipediaXmlFile = new File(wikipediaXmlFileName);
		wikipediaDirName = wikipediaXmlFile.getParent();

		String name = wikipediaXmlFile.getName();
		//[lang]wiki-[version]-pages-articles.xml
		logger.debug(name);
		Matcher matcher = XML_FILE_NAME_PATTERN.matcher(name);
		if (matcher.find()) {
			lang = matcher.group(1);
			locale = new Locale(lang);
			version = matcher.group(2);
		}
		else {
			// Without lang/version every derived name would contain "null"; make that visible.
			logger.warn("Cannot extract language and version from '" + name + "': expected [lang]wiki-[version]-pages-articles.xml");
		}
	}

	/**
	 * Computes every derived file, index and log name from the given output
	 * directory and the current {@code lang} and {@code version}.
	 * <p>
	 * A trailing {@link File#separator} is appended to the directory when missing.
	 * The three label-to-file-name maps are (re)populated as well.
	 *
	 * @param extractionOutputDirName directory that receives the output files; must not be {@code null}
	 */
	protected void setNames(String extractionOutputDirName) {
		this.extractionOutputDirName = extractionOutputDirName;

		if (!this.extractionOutputDirName.endsWith(File.separator)) {
			this.extractionOutputDirName += File.separator;
		}

		wikipediaOutputFilePrefixName = this.extractionOutputDirName + lang + "wiki-" + version + '-';
		// Short alias: every name below starts with this common prefix.
		final String prefix = wikipediaOutputFilePrefixName;

		wikipediaAnalysisFileName = prefix + "analysis.csv";

		wikipediaRedirFileName = prefix + "redirect.csv";
		wikipediaRedirIndexName = prefix + "redirect-index";

		wikipediaDisambiguationFileName = prefix + "disambiguation.csv";
		wikipediaDisambiguationIndexName = prefix + "disambiguation-index-raw";
		wikipediaTitleIdFileName = prefix + "title-id.csv";
		wikipediaPageTrafficFileName = prefix + "page-traffic.csv";
		wikipediaContentPageFileName = prefix + "content-page.csv";
		wikipediaTextFileName = prefix + "text.csv";
		wikipediaVectorFileName = prefix + "vector.csv";
		wikipediaVectorIndexName = prefix + "page-vector-index";
		wikipediaPersonInfoFileName = prefix + "person-info.csv";
		wikipediaPersonInfoIndexName = prefix + "person-info-index";
		wikipediaFirstNameFileName = prefix + "first-name.csv";
		wikipediaOutgoingFileName = prefix + "outgoing.csv";
		wikipediaFileName = prefix + "file.csv";
		// NOTE(review): same value as wikipediaFileName above - confirm whether this is intended.
		commonsFileName = prefix + "file.csv";
		wikipediaAbstractFileName = prefix + "abstract.csv";
		wikipediaFirstSentenceFileName = prefix + "links-first-sentence.csv";
		wikipediaSeeAlsoFileName = prefix + "page-see-also.csv";
		wikipediaSeeAlsoIndexName = prefix + "page-see-also-index";
		wikipediaAbstractIndexName = prefix + "abstract-index";
		wikipediaFirstSentenceIndexName = prefix + "links-first-sentence-index";
		wikipediaFileSourceName = prefix + "page-file-source.csv";
		wikipediaFileSourceIndexName = prefix + "page-file-source-index";
		wikipediaIncomingOutgoingFileName = prefix + "incoming-outgoing.csv";
		wikipediaSortedIncomingOutgoingFileName = prefix + "sorted-incoming-outgoing.csv";
		wikipediaPageCategoryPrefix = prefix + "page-category";
		wikipediaPageTopCategoryFileName = prefix + "page-top-category.csv";
		wikipediaPageAllCategoryFileName = prefix + "page-all-category.csv";
		wikipediaPageAllCategoryIndexName = prefix + "page-all-category-index";
		wikipediaPagePerCategoryCountFileName = prefix + "page-per-category-count.csv";
		wikipediaSortedPagePerCategoryCountFileName = prefix + "sorted-page-per-category-count.csv";
		wikipediCategoryIndexFileName = prefix + "category-index.csv";
		wikipediaPageTopCategoryIndexName = prefix + "page-top-category-index";
		wikipediaPageCategoryFileName = wikipediaPageCategoryPrefix + ".csv";
		wikipediaPageCategoryIndexName = prefix + "page-category-index";
		wikipediaPageCategoryMainFileName = wikipediaPageCategoryPrefix + "-main.csv";
		wikipediaPageCategoryMainSortedCategoryFileName = wikipediaPageCategoryPrefix + "-main-sorted-category.csv";
		wikipediaPageCategoryXmlIndex = wikipediaPageCategoryPrefix + "-xml-index-raw";
		wikipediaCategorySuperCategoryFileName = prefix + "category-super-category.csv";
		wikipediaCategorySuperCategoryIndexName = prefix + "category-super-category-index";

		wikipediaCategorySubCategoryFileName = prefix + "category-sub-category.csv";
		wikipediaCategorySubCategoryIndexName = prefix + "category-sub-category-index";

		wikipediaCategoryPageFileName = prefix + "category-page.csv";
		wikipediaCategoryPageIndexName = prefix + "category-page-index";

		wikipediaCategoryFileName = prefix + "category.csv";

		wikipediaCrossLanguageLinkFileName = prefix + "cross-lang.csv";
		wikipediaTemplateFilePrefixName = prefix + "template-";
		wikipediaExampleFileName = prefix + "example.csv";
		wikipediaFilteredExampleFileName = prefix + "filtered-example.csv";
		wikipediaSortedPageFileName = prefix + "sorted-page.csv";
		wikipediaNomFileName = prefix + "nom.csv";
		wikipediaSortedFormFileName = prefix + "sorted-form.csv";

		//WikipediaSectionExtractor filenames
		wikipediaSectionFileName = prefix + "section-text" + OUTPUT_EXT;
		wikipediaPageSectionFreqFileName = prefix + "page-section-freq" + OUTPUT_EXT;
		wikipediaSectionFreqFileName = prefix + "section-freq" + OUTPUT_EXT;

		wikipediaSectionTitlePrefix = prefix + "section-title";
		wikipediaSectionTitleFileName = wikipediaSectionTitlePrefix + ".csv";
		wikipediaIncomingFileName = prefix + "incoming.csv";
		wikipediaFormFreqFileName = prefix + "form-freq.csv";
		wikipediaPageFreqFileName = prefix + "page-freq.csv";
		wikipediaPageFreqIndexName = prefix + "page-freq-index";
		wikipediaFormIdFileName = prefix + "form-id.csv";

		wikipediaInfoboxFileName = prefix + "infobox.csv";
		wikipediaDBPediaFileName = prefix + "dbpedia.csv";
		wikipediaPageFormIndexName = prefix + "page-form-index";
		wikipediaFirstNameIndexName = prefix + "first-name-index";
		wikipediaFormPageIndexName = prefix + "form-page-index";
		wikipediaOutgoingIndexName = prefix + "outgoing-index";
		wikipediaIncomingIndexName = prefix + "incoming-index";
		wikipediaIncomingOutgoingIndexName = prefix + "incoming-outgoing-index";
		wikipediaIncomingOutgoingWeightedIndexName = prefix + "incoming-outgoing-weighted-index";
		wikipediaNGramFileName = prefix + "ngram.csv";
		wikipediaUnigramFileName = prefix + "unigram.csv";
		wikipediaNGramIndexName = prefix + "ngram-index";
		wikipediaTypeIndexName = prefix + "type-index";
		wikipediaPageTypeIndexName = prefix + "page-type-index";
		wikipediaPageTypeFileName = prefix + "page-type.csv";
		preprocessingLogFileName = prefix + "preprocessing.log";
		vectorLogFileName = prefix + "lsa.log";
		extractionLogFileName = prefix + "extraction.log";
		sortingLogFileName = prefix + "sorting.log";
		indexingLogFileName = prefix + "indexing.log";
		outgoingLogFileName = prefix + "outgoing.log";
		incomingOutgoingLogFileName = prefix + "incoming-outgoing.log";
		oneExamplePerSenseFileName = prefix + "one-example-per-sense.csv";
		oneExamplePerSenseIndexName = prefix + "one-example-per-sense-index";
		wikipediaTextIndexName = prefix + "text-index";
		wikipediaPageAirPediaClassIndexName = prefix + "airpedia-class-index";

		wikipediaTemplateFileNames.put("name", wikipediaTemplateFilePrefixName + "name.csv");
		wikipediaTemplateFileNames.put("freq", wikipediaTemplateFilePrefixName + "freq.csv");
		wikipediaTemplateFileNames.put("map", wikipediaTemplateFilePrefixName + "map.csv");
		wikipediaTemplateFileNames.put("map-rep", wikipediaTemplateFilePrefixName + "map-rep.csv");
		wikipediaTemplateFileNames.put("map-prop", wikipediaTemplateFilePrefixName + "map-prop.csv");
		wikipediaTemplateFileNames.put("complete", wikipediaTemplateFilePrefixName + "complete.csv");
		wikipediaTemplateFileNames.put("good", wikipediaTemplateFilePrefixName + "good.csv");
		wikipediaTemplateFileNames.put("pruned", wikipediaTemplateFilePrefixName + "pruned.csv");
		wikipediaTemplateFileNames.put("pruned-s-page", wikipediaTemplateFilePrefixName + "pruned-s-page.csv");
		wikipediaTemplateFileNames.put("pruned-s-tpl", wikipediaTemplateFilePrefixName + "pruned-s-tpl.csv");
		wikipediaTemplateFileNames.put("index-p2t", wikipediaTemplateFilePrefixName + "index-p2t");
		wikipediaTemplateFileNames.put("index-t2p", wikipediaTemplateFilePrefixName + "index-t2p");
		wikipediaTemplateFileNames.put("index-id", wikipediaTemplateFilePrefixName + "index-id");
		wikipediaTemplateFileNames.put("properties", wikipediaTemplateFilePrefixName + "properties-index");
		wikipediaTemplateFileNames.put("infoboxes", wikipediaTemplateFilePrefixName + "infoboxes.csv");
		wikipediaTemplateFileNames.put("navigation", wikipediaTemplateFilePrefixName + "navigation.csv");

		wikipediaPagePortalIndexName = prefix + "page-portal-index";
		wikipediaPageNavigationTemplateIndexName = prefix + "page-navigation-index";
		wikipediaPagePortalFileName = prefix + "page-portal.csv";
		wikipediaPageNavigationTemplateFileName = prefix + "page-navigation.csv";

		wikipediaDBPediaClassesIndexName = prefix + "page-dbpediaclass-index";

		wikipediaCategoryFileNames.put("s-cat", wikipediaPageCategoryPrefix + "-s-cat.csv");
		wikipediaCategoryFileNames.put("index-p2c", wikipediaPageCategoryPrefix + "-index-p2c");
		wikipediaCategoryFileNames.put("index-c2p", wikipediaPageCategoryPrefix + "-index-c2p");
		wikipediaCategoryFileNames.put("index-id", wikipediaPageCategoryPrefix + "-index-id");
		wikipediaCategoryFileNames.put("tokens", wikipediaPageCategoryPrefix + "-tokens.csv");
		wikipediaCategoryFileNames.put("tokens-s-tok", wikipediaPageCategoryPrefix + "-tokens-s-tok.csv");
		wikipediaCategoryFileNames.put("tokens-index-p2k", wikipediaPageCategoryPrefix + "-tokens-index-p2k");
		wikipediaCategoryFileNames.put("tokens-index-k2p", wikipediaPageCategoryPrefix + "-tokens-index-k2p");
		wikipediaCategoryFileNames.put("tokens-index-id", wikipediaPageCategoryPrefix + "-tokens-index-id");

		wikipediaSectionTitleFileNames.put("s-sec", wikipediaSectionTitlePrefix + "-s-sec.csv");
		wikipediaSectionTitleFileNames.put("index-p2s", wikipediaSectionTitlePrefix + "-index-p2s");
		wikipediaSectionTitleFileNames.put("index-s2p", wikipediaSectionTitlePrefix + "-index-s2p");
		wikipediaSectionTitleFileNames.put("index-id", wikipediaSectionTitlePrefix + "-index-id");
		wikipediaSectionTitleFileNames.put("tokens", wikipediaSectionTitlePrefix + "-tokens.csv");
		wikipediaSectionTitleFileNames.put("tokens-s-tok", wikipediaSectionTitlePrefix + "-tokens-s-tok.csv");
		wikipediaSectionTitleFileNames.put("tokens-index-p2k", wikipediaSectionTitlePrefix + "-tokens-index-p2k");
		wikipediaSectionTitleFileNames.put("tokens-index-k2p", wikipediaSectionTitlePrefix + "-tokens-index-k2p");
		wikipediaSectionTitleFileNames.put("tokens-index-id", wikipediaSectionTitlePrefix + "-tokens-index-id");

		// Links to global models
		wikipediaNamNomIndexName = prefix + "page-namnom-index";
		wikipediaAirpedia2IndexName = prefix + "page-airpedia2class-index";
		wikipediaCrossLanguageLinkIndexName = prefix + "cross-lang-index";
		wikipediaPageTopicsIndexName = prefix + "page-topics-index";
		wikipediaPageTopicsFileName = prefix + "page-topics.csv";
	}

	// ---------------------------------------------------------------------
	// Accessors. Unless documented otherwise, each one returns (or sets) the
	// field of the same name without any further processing.
	// ---------------------------------------------------------------------

	public String getWikipediaPageTopicsIndexName() {
		return wikipediaPageTopicsIndexName;
	}

	public String getWikipediaPageTopicsFileName() {
		return wikipediaPageTopicsFileName;
	}

	public String getExtractionOutputDirName() {
		return extractionOutputDirName;
	}

	public String getWikipediaRedirIndexName() {
		return wikipediaRedirIndexName;
	}

	public String getWikipediaPageCategoryMainSortedCategoryFileName() {
		return wikipediaPageCategoryMainSortedCategoryFileName;
	}

	public String getWikipediaPageCategoryMainFileName() {
		return wikipediaPageCategoryMainFileName;
	}

	public String getWikipediaPageCategoryXmlIndex() {
		return wikipediaPageCategoryXmlIndex;
	}

	public String getWikipediaFirstSentenceFileName() {
		return wikipediaFirstSentenceFileName;
	}

	public String getWikipediaFirstSentenceIndexName() {
		return wikipediaFirstSentenceIndexName;
	}

	public String getWikipediaAirpedia2IndexName() {
		return wikipediaAirpedia2IndexName;
	}

	public String getWikipediaNamNomIndexName() {
		return wikipediaNamNomIndexName;
	}

	public String getWikipediaPageNavigationTemplateIndexName() {
		return wikipediaPageNavigationTemplateIndexName;
	}

	public String getWikipediaPageNavigationTemplateFileName() {
		return wikipediaPageNavigationTemplateFileName;
	}

	public String getWikipediaPagePortalIndexName() {
		return wikipediaPagePortalIndexName;
	}

	public String getWikipediaPagePortalFileName() {
		return wikipediaPagePortalFileName;
	}

	/**
	 * Returns the live (not copied) map of section-title file names.
	 *
	 * @return map from short label to file or index name
	 */
	public HashMap<String, String> getWikipediaSectionTitleFileNames() {
		return wikipediaSectionTitleFileNames;
	}

	/**
	 * Returns the live (not copied) map of template file names.
	 *
	 * @return map from short label to file or index name
	 */
	public HashMap<String, String> getWikipediaTemplateFileNames() {
		return wikipediaTemplateFileNames;
	}

	/**
	 * Returns the live (not copied) map of category file names.
	 *
	 * @return map from short label to file or index name
	 */
	public HashMap<String, String> getWikipediaCategoryFileNames() {
		return wikipediaCategoryFileNames;
	}

	public String getWikipediaDBPediaClassesIndexName() {
		return wikipediaDBPediaClassesIndexName;
	}

	public String getWikipediaDisambiguationIndexName() {
		return wikipediaDisambiguationIndexName;
	}

	public void setWikipediaDisambiguationIndexName(String wikipediaDisambiguationIndexName) {
		this.wikipediaDisambiguationIndexName = wikipediaDisambiguationIndexName;
	}

	public String getWikipediaSectionFreqFileName() {
		return wikipediaSectionFreqFileName;
	}

	public String getWikipediaCategoryPageFileName() {
		return wikipediaCategoryPageFileName;
	}

	public String getWikipediaCategoryPageIndexName() {
		return wikipediaCategoryPageIndexName;
	}

	public String getWikipediaCategoryFileName() {
		return wikipediaCategoryFileName;
	}

	public String getWikipediaPageAllCategoryIndexName() {
		return wikipediaPageAllCategoryIndexName;
	}

	public String getWikipediaSortedPagePerCategoryCountFileName() {
		return wikipediaSortedPagePerCategoryCountFileName;
	}

	public String getWikipediCategoryIndexFileName() {
		return wikipediCategoryIndexFileName;
	}

	public String getWikipediaPageTypeIndexName() {
		return wikipediaPageTypeIndexName;
	}

	public String getWikipediaPageTypeFileName() {
		return wikipediaPageTypeFileName;
	}

	public String getWikipediaNomFileName() {
		return wikipediaNomFileName;
	}

	public String getWikipediaPagePerCategoryCountFileName() {
		return wikipediaPagePerCategoryCountFileName;
	}

	public String getWikipediaSeeAlsoFileName() {
		return wikipediaSeeAlsoFileName;
	}

	public String getWikipediaSeeAlsoIndexName() {
		return wikipediaSeeAlsoIndexName;
	}

	public String getWikipediaPageAllCategoryFileName() {
		return wikipediaPageAllCategoryFileName;
	}

	public String getWikipediaPageTopCategoryFileName() {
		return wikipediaPageTopCategoryFileName;
	}

	public String getWikipediaPageTopCategoryIndexName() {
		return wikipediaPageTopCategoryIndexName;
	}

	public String getWikipediaPageFreqIndexName() {
		return wikipediaPageFreqIndexName;
	}

	public void setWikipediaPageFreqIndexName(String wikipediaPageFreqIndexName) {
		this.wikipediaPageFreqIndexName = wikipediaPageFreqIndexName;
	}

	public String getWikipediaFormIdFileName() {
		return wikipediaFormIdFileName;
	}

	public void setWikipediaFormIdFileName(String wikipediaFormIdFileName) {
		this.wikipediaFormIdFileName = wikipediaFormIdFileName;
	}

	public String getWikipediaIncomingOutgoingWeightedIndexName() {
		return wikipediaIncomingOutgoingWeightedIndexName;
	}

	public void setWikipediaIncomingOutgoingWeightedIndexName(String wikipediaIncomingOutgoingWeightedIndexName) {
		this.wikipediaIncomingOutgoingWeightedIndexName = wikipediaIncomingOutgoingWeightedIndexName;
	}

	public String getWikipediaPageTrafficFileName() {
		return wikipediaPageTrafficFileName;
	}

	public void setWikipediaPageTrafficFileName(String wikipediaPageTrafficFileName) {
		this.wikipediaPageTrafficFileName = wikipediaPageTrafficFileName;
	}

	public String getWikipediaCategorySubCategoryFileName() {
		return wikipediaCategorySubCategoryFileName;
	}

	public String getWikipediaCategorySubCategoryIndexName() {
		return wikipediaCategorySubCategoryIndexName;
	}

	public String getWikipediaCategorySuperCategoryIndexName() {
		return wikipediaCategorySuperCategoryIndexName;
	}

	public String getWikipediaPageCategoryIndexName() {
		return wikipediaPageCategoryIndexName;
	}

	public String getWikipediaPersonInfoIndexName() {
		return wikipediaPersonInfoIndexName;
	}

	public String getWikipediaAbstractIndexName() {
		return wikipediaAbstractIndexName;
	}

	public String getWikipediaAbstractFileName() {
		return wikipediaAbstractFileName;
	}

	public String getWikipediaPageAirPediaClassIndexName() {
		return wikipediaPageAirPediaClassIndexName;
	}

	public String getWikipediaFirstNameFileName() {
		return wikipediaFirstNameFileName;
	}

	public String getWikipediaFirstNameIndexName() {
		return wikipediaFirstNameIndexName;
	}

	public String getWikipediaFileSourceName() {
		return wikipediaFileSourceName;
	}

	public String getWikipediaFileSourceIndexName() {
		return wikipediaFileSourceIndexName;
	}

	public String getCommonsFileName() {
		return commonsFileName;
	}

	public String getWikipediaFileName() {
		return wikipediaFileName;
	}

	public String getWikipediaTextIndexName() {
		return wikipediaTextIndexName;
	}

	public String getOneExamplePerSenseFileName() {
		return oneExamplePerSenseFileName;
	}

	public String getOneExamplePerSenseIndexName() {
		return oneExamplePerSenseIndexName;
	}

	public String getWikipediaSectionTitleFileName() {
		return wikipediaSectionTitleFileName;
	}

	public String getWikipediaUnigramFileName() {
		return wikipediaUnigramFileName;
	}

	public String getWikipediaSortedIncomingOutgoingFileName() {
		return wikipediaSortedIncomingOutgoingFileName;
	}

	public String getIncomingOutgoingLogFileName() {
		return incomingOutgoingLogFileName;
	}

	public String getWikipediaIncomingOutgoingFileName() {
		return wikipediaIncomingOutgoingFileName;
	}

	public String getWikipediaIncomingOutgoingIndexName() {
		return wikipediaIncomingOutgoingIndexName;
	}

	public String getWikipediaOutgoingIndexName() {
		return wikipediaOutgoingIndexName;
	}

	public String getWikipediaIncomingIndexName() {
		return wikipediaIncomingIndexName;
	}

	public String getWikipediaContentPageFileName() {
		return wikipediaContentPageFileName;
	}

	public String getOutgoingLogFileName() {
		return outgoingLogFileName;
	}

	public String getVectorLogFileName() {
		return vectorLogFileName;
	}

	public String getWikipediaVectorIndexName() {
		return wikipediaVectorIndexName;
	}

	public String getWikipediaCrossLanguageLinkIndexName() {
		return wikipediaCrossLanguageLinkIndexName;
	}

	public String getExtractionLogFileName() {
		return extractionLogFileName;
	}

	public String getPreprocessingLogFileName() {
		return preprocessingLogFileName;
	}

	public String getSortingLogFileName() {
		return sortingLogFileName;
	}

	public String getIndexingLogFileName() {
		return indexingLogFileName;
	}

	public String getWikipediaTypeIndexName() {
		return wikipediaTypeIndexName;
	}

	public String getWikipediaNGramIndexName() {
		return wikipediaNGramIndexName;
	}

	public String getWikipediaNGramFileName() {
		return wikipediaNGramFileName;
	}

	public String getWikipediaPageFormIndexName() {
		return wikipediaPageFormIndexName;
	}

	public void setWikipediaPageFormIndexName(String wikipediaPageFormIndexName) {
		this.wikipediaPageFormIndexName = wikipediaPageFormIndexName;
	}

	public String getWikipediaFormPageIndexName() {
		return wikipediaFormPageIndexName;
	}

	public void setWikipediaFormPageIndexName(String wikipediaFormPageIndexName) {
		this.wikipediaFormPageIndexName = wikipediaFormPageIndexName;
	}

	public String getWikipediaFilteredExampleFileName() {
		return wikipediaFilteredExampleFileName;
	}

	public String getWikipediaInfoboxFileName() {
		return wikipediaInfoboxFileName;
	}

	public String getWikipediaDBPediaFileName() {
		return wikipediaDBPediaFileName;
	}

	public String getVersion() {
		return version;
	}

	/**
	 * Overrides the stored version. Names already computed by
	 * {@link #setNames(String)} are not recomputed.
	 *
	 * @param version the new version string
	 */
	public void setVersion(String version) {
		this.version = version;
	}

	public Locale getLocale() {
		return locale;
	}

	public String getLang() {
		return lang;
	}

	public String getWikipediaDirName() {
		return wikipediaDirName;
	}

	public String getWikipediaRedirFileName() {
		return wikipediaRedirFileName;
	}

	public String getWikipediaDisambiguationFileName() {
		return wikipediaDisambiguationFileName;
	}

	public String getWikipediaAnalysisFileName() {
		return wikipediaAnalysisFileName;
	}

	public String getWikipediaTitleIdFileName() {
		return wikipediaTitleIdFileName;
	}

	public String getWikipediaPersonInfoFileName() {
		return wikipediaPersonInfoFileName;
	}

	public String getWikipediaOutgoingFileName() {
		return wikipediaOutgoingFileName;
	}

	public String getWikipediaPageCategoryFileName() {
		return wikipediaPageCategoryFileName;
	}

	public String getWikipediaCategorySuperCategoryFileName() {
		return wikipediaCategorySuperCategoryFileName;
	}

	public String getWikipediaCrossLanguageLinkFileName() {
		return wikipediaCrossLanguageLinkFileName;
	}

	public String getWikipediaTemplateFilePrefixName() {
		return wikipediaTemplateFilePrefixName;
	}

	public String getWikipediaExampleFileName() {
		return wikipediaExampleFileName;
	}

	/**
	 * Returns the common prefix of the section-title files (the value that
	 * {@code "-s-sec.csv"}, {@code "-index-p2s"}, ... are appended to).
	 * <p>
	 * This used to return the full {@code section-title.csv} file name, which is
	 * what {@link #getWikipediaSectionTitleFileName()} is for.
	 *
	 * @return the section-title file name prefix, without extension
	 */
	public String getWikipediaSectionTitleFilePrefixName() {
		return wikipediaSectionTitlePrefix;
	}

	public String getWikipediaSortedPageFileName() {
		return wikipediaSortedPageFileName;
	}

	public String getWikipediaSortedFormFileName() {
		return wikipediaSortedFormFileName;
	}

	public String getWikipediaIncomingFileName() {
		return wikipediaIncomingFileName;
	}

	public String getWikipediaFormFreqFileName() {
		return wikipediaFormFreqFileName;
	}

	public String getWikipediaPageFreqFileName() {
		return wikipediaPageFreqFileName;
	}

	public String getWikipediaTextFileName() {
		return wikipediaTextFileName;
	}

	public String getWikipediaVectorFileName() {
		return wikipediaVectorFileName;
	}

	public String getWikipediaXmlFileName() {
		return wikipediaXmlFileName;
	}

	public String getWikipediaSectionFileName() {
		return wikipediaSectionFileName;
	}

	public String getWikipediaPageSectionFreqFileName() {
		return wikipediaPageSectionFreqFileName;
	}

	/**
	 * Returns a textual dump of a subset of the parameters (not every field is listed).
	 */
	@Override
	public String toString() {
		return "ExtractorParameters{" +
				"version='" + version + '\'' +
				", lang='" + lang + '\'' +
				", wikipediaXmlFileName='" + wikipediaXmlFileName + '\'' +
				", extractionOutputDirName='" + extractionOutputDirName + '\'' +
				", wikipediaDirName='" + wikipediaDirName + '\'' +
				", wikipediaOutputFilePrefixName='" + wikipediaOutputFilePrefixName + '\'' +
				", wikipediaRedirFileName='" + wikipediaRedirFileName + '\'' +
				", wikipediaDisambiguationFileName='" + wikipediaDisambiguationFileName + '\'' +
				", wikipediaAnalysisFileName='" + wikipediaAnalysisFileName + '\'' +
				", wikipediaTitleIdFileName='" + wikipediaTitleIdFileName + '\'' +
				", wikipediaPersonInfoFileName='" + wikipediaPersonInfoFileName + '\'' +
				", wikipediaOutgoingFileName='" + wikipediaOutgoingFileName + '\'' +
				", wikipediaPageCategoryPrefix='" + wikipediaPageCategoryPrefix + '\'' +
				", wikipediaPageCategoryFileName='" + wikipediaPageCategoryFileName + '\'' +
				", wikipediaCategoryFileNames=" + wikipediaCategoryFileNames +
				", wikipediaCategorySuperCategoryFileName='" + wikipediaCategorySuperCategoryFileName + '\'' +
				", wikipediaCrossLanguageLinkFileName='" + wikipediaCrossLanguageLinkFileName + '\'' +
				", wikipediaTemplateFilePrefixName='" + wikipediaTemplateFilePrefixName + '\'' +
				", wikipediaTemplateFileNames=" + wikipediaTemplateFileNames +
				", wikipediaInfoboxFileName='" + wikipediaInfoboxFileName + '\'' +
				", wikipediaExampleFileName='" + wikipediaExampleFileName + '\'' +
				", wikipediaSectionTitlePrefix='" + wikipediaSectionTitlePrefix + '\'' +
				", wikipediaSectionTitleFileName='" + wikipediaSectionTitleFileName + '\'' +
				", wikipediaSectionTitleFileNames=" + wikipediaSectionTitleFileNames +
				", wikipediaSortedPageFileName='" + wikipediaSortedPageFileName + '\'' +
				", wikipediaSortedFormFileName='" + wikipediaSortedFormFileName + '\'' +
				", wikipediaIncomingFileName='" + wikipediaIncomingFileName + '\'' +
				", wikipediaFormFreqFileName='" + wikipediaFormFreqFileName + '\'' +
				", wikipediaPageFreqFileName='" + wikipediaPageFreqFileName + '\'' +
				", wikipediaTextFileName='" + wikipediaTextFileName + '\'' +
				", wikipediaNGramFileName='" + wikipediaNGramFileName + '\'' +
				", wikipediaVectorFileName='" + wikipediaVectorFileName + '\'' +
				", wikipediaDBPediaFileName='" + wikipediaDBPediaFileName + '\'' +
				", wikipediaFilteredExampleFileName='" + wikipediaFilteredExampleFileName + '\'' +
				", wikipediaPageFormIndexName='" + wikipediaPageFormIndexName + '\'' +
				", wikipediaFormPageIndexName='" + wikipediaFormPageIndexName + '\'' +
				", wikipediaNGramIndexName='" + wikipediaNGramIndexName + '\'' +
				", wikipediaTypeIndexName='" + wikipediaTypeIndexName + '\'' +
				", wikipediaCrossLanguageLinkIndexName='" + wikipediaCrossLanguageLinkIndexName + '\'' +
				", extractionLogFileName='" + extractionLogFileName + '\'' +
				", preprocessingLogFileName='" + preprocessingLogFileName + '\'' +
				", sortingLogFileName='" + sortingLogFileName + '\'' +
				", indexingLogFileName='" + indexingLogFileName + '\'' +
				", wikipediaVectorIndexName='" + wikipediaVectorIndexName + '\'' +
				", vectorLogFileName='" + vectorLogFileName + '\'' +
				", outgoingLogFileName='" + outgoingLogFileName + '\'' +
				", wikipediaContentPageFileName='" + wikipediaContentPageFileName + '\'' +
				", wikipediaOutgoingIndexName='" + wikipediaOutgoingIndexName + '\'' +
				", wikipediaIncomingIndexName='" + wikipediaIncomingIndexName + '\'' +
				", wikipediaIncomingOutgoingFileName='" + wikipediaIncomingOutgoingFileName + '\'' +
				", wikipediaIncomingOutgoingIndexName='" + wikipediaIncomingOutgoingIndexName + '\'' +
				", incomingOutgoingLogFileName='" + incomingOutgoingLogFileName + '\'' +
				", wikipediaSortedIncomingOutgoingFileName='" + wikipediaSortedIncomingOutgoingFileName + '\'' +
				", wikipediaUnigramFileName='" + wikipediaUnigramFileName + '\'' +
				", oneExamplePerSenseFileName='" + oneExamplePerSenseFileName + '\'' +
				", oneExamplePerSenseIndexName='" + oneExamplePerSenseIndexName + '\'' +
				", wikipediaTextIndexName='" + wikipediaTextIndexName + '\'' +
				", locale=" + locale +
				'}';
	}

	/**
	 * Command-line entry point: builds the parameters for a dump file and logs
	 * them at debug level.
	 * <p>
	 * The log4j configuration file is taken from the {@code log-config} system
	 * property, defaulting to {@code configuration/log-config.txt}.
	 *
	 * @param args {@code args[0]} is the Wikipedia XML dump file, {@code args[1]} the output directory
	 * @throws IllegalArgumentException if fewer than two arguments are given
	 * @throws Exception                declared for compatibility with the original signature
	 */
	public static void main(String[] args) throws Exception {
		if (args.length < 2) {
			throw new IllegalArgumentException("Usage: java -cp dist/thewikimachine.jar eu.fbk.twm.utils.ExtractorParameters <wikipedia-xml-file> <output-dir>");
		}

		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = "configuration/log-config.txt";
		}

		PropertyConfigurator.configure(logConfig);
		//java -cp dist/thewikimachine.jar eu.fbk.twm.utils.ExtractorParameters
		ExtractorParameters extractorParameters = new ExtractorParameters(args[0], args[1]);
		logger.debug(extractorParameters);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy