// All downloads are free. Search and download functionality uses the official Maven repository.

// eu.fbk.twm.wiki.xmldump.WikipediaPagePortalExtractor Maven / Gradle / Ivy

/*
 * Copyright (2013) Fondazione Bruno Kessler (http://www.fbk.eu/)
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package eu.fbk.twm.wiki.xmldump;

import eu.fbk.twm.utils.CharacterTable;
import eu.fbk.twm.utils.ExtractorParameters;
import eu.fbk.twm.utils.WikipediaExtractor;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplate;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplateParser;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;

import java.io.*;
import java.util.ArrayList;
import java.util.Locale;
import java.util.Set;

/**
 * User: aprosio
 * <p>
 * Extracts the list of page/portal pairs from a Wikipedia XML dump.
 * <p>
 * For every content page, the templates found in the page text are scanned; when a
 * template's first part equals the localized {@code PORTAL_LABEL} resource (ignoring case),
 * one line per template part is written to the page/portal output file in the form
 * {@code title + CharacterTable.HORIZONTAL_TABULATION + normalizedPortalName + CharacterTable.LINE_FEED}.
 * <p>
 * {@link #contentPage(String, String, int)} serializes its writes on this instance's monitor.
 * NOTE(review): this suggests it is invoked from several worker threads - confirm against
 * the superclass.
 */
public class WikipediaPagePortalExtractor extends AbstractWikipediaExtractor implements WikipediaExtractor {
	/**
	 * Define a static logger variable so that it references the
	 * Logger instance named WikipediaPagePortalExtractor.
	 */
	static final Logger logger = Logger.getLogger(WikipediaPagePortalExtractor.class.getName());

	/**
	 * Writer for the page/portal output file; stays {@code null} until {@link #start} has
	 * opened the file successfully.
	 */
	private PrintWriter templatePortalWriter;

	/**
	 * Creates the extractor; all arguments are forwarded unchanged to the superclass.
	 *
	 * @param numThreads forwarded to the superclass constructor
	 * @param numPages   forwarded to the superclass constructor
	 * @param locale     forwarded to the superclass constructor
	 */
	public WikipediaPagePortalExtractor(int numThreads, int numPages, Locale locale) {
		super(numThreads, numPages, locale);
	}

	/**
	 * Opens the page/portal output file (UTF-8) and starts processing the XML dump.
	 * <p>
	 * If the output file cannot be opened the error is logged and the process is ended
	 * without reading the dump. If no {@code PORTAL_LABEL} resource is available the
	 * process is ended immediately as well, because no template could ever match.
	 *
	 * @param extractorParameters supplies the output file name and the XML dump file name
	 */
	@Override
	public void start(ExtractorParameters extractorParameters) {
		String outputFileName = extractorParameters.getWikipediaPagePortalFileName();
		try {
			templatePortalWriter = new PrintWriter(new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputFileName), "UTF-8")));
		} catch (IOException e) {
			// Without a writer nothing can be extracted: stop here instead of failing later
			// with a NullPointerException in contentPage()/endProcess().
			logger.error("Unable to open page/portal output file " + outputFileName, e);
			endProcess();
			return;
		}

		// NOTE(review): assumes resources.getString returns null for a missing key; if
		// `resources` is a java.util.ResourceBundle it throws MissingResourceException instead.
		if (resources.getString("PORTAL_LABEL") != null) {
			startProcess(extractorParameters.getWikipediaXmlFileName());
		}
		else {
			endProcess();
		}
	}

	@Override
	public void filePage(String text, String title, int wikiID) {
		// Intentionally empty: file pages are ignored by this extractor.
	}

	@Override
	public void disambiguationPage(String text, String title, int wikiID) {
		// Intentionally empty: disambiguation pages are ignored by this extractor.
	}

	@Override
	public void categoryPage(String text, String title, int wikiID) {
		// Intentionally empty: category pages are ignored by this extractor.
	}

	@Override
	public void redirectPage(String text, String title, int wikiID) {
		// Intentionally empty: redirect pages are ignored by this extractor.
	}

	@Override
	public void portalPage(String text, String title, int wikiID) {
		// Intentionally empty: portal pages are ignored by this extractor.
	}

	@Override
	public void projectPage(String text, String title, int wikiID) {
		// Intentionally empty: project pages are ignored by this extractor.
	}

	@Override
	public void templatePage(String text, String title, int wikiID) {
		// Intentionally empty: template pages are ignored by this extractor.
	}

	/**
	 * Parses the templates of a content page and writes one {@code title/portal} line for
	 * every part of each template whose first part equals the {@code PORTAL_LABEL} resource
	 * (case-insensitively). Parts whose key is itself the portal label are skipped.
	 *
	 * @param text   the page text handed to {@code WikiTemplateParser.parse}
	 * @param title  the page title, written as the first column
	 * @param wikiID the page identifier (unused)
	 */
	@Override
	public void contentPage(String text, String title, int wikiID) {
		String portalLabel = resources.getString("PORTAL_LABEL");
		if (portalLabel == null || templatePortalWriter == null) {
			return;
		}
		// Locale.ROOT keeps the comparison independent of the JVM default locale.
		String normalizedLabel = portalLabel.toLowerCase(Locale.ROOT);

		ArrayList<WikiTemplate> listOfTemplates = WikiTemplateParser.parse(text, false);

		// Collect all lines of this page first so that the lock is taken only once per page.
		StringBuilder buff = new StringBuilder();
		for (WikiTemplate t : listOfTemplates) {

			String firstPart = t.getFirstPart();
			if (firstPart == null) {
				continue;
			}
			if (!firstPart.toLowerCase(Locale.ROOT).equals(normalizedLabel)) {
				continue;
			}

			Set<String> parts = t.getHashMapOfParts().keySet();
			for (String p : parts) {
				if (p.toLowerCase(Locale.ROOT).equals(normalizedLabel)) {
					continue;
				}

				buff.append(title);
				buff.append(CharacterTable.HORIZONTAL_TABULATION);
				buff.append(normalizePageName(p));
				buff.append(CharacterTable.LINE_FEED);
			}
		}

		if (buff.length() == 0) {
			return;
		}
		synchronized (this) {
			templatePortalWriter.append(buff);
		}
	}

	/**
	 * Ends the extraction: delegates to the superclass, then flushes and closes the output
	 * writer if it was opened. A write failure recorded by the writer is logged, because
	 * {@link PrintWriter} never throws on I/O errors.
	 */
	@Override
	public void endProcess() {
		super.endProcess();
		if (templatePortalWriter == null) {
			// start() could not open the output file; nothing to flush or close.
			return;
		}
		templatePortalWriter.flush();
		if (templatePortalWriter.checkError()) {
			logger.error("An I/O error occurred while writing the page/portal output file");
		}
		templatePortalWriter.close();
	}

	/**
	 * Command-line entry point.
	 * <p>
	 * Logging is configured from the file named by the {@code log-config} system property,
	 * defaulting to {@code configuration/log-config.txt}.
	 *
	 * @param args {@code args[0]} is the Wikipedia XML dump file name, {@code args[1]} the base directory
	 */
	public static void main(String[] args) {
		if (args == null || args.length < 2) {
			System.err.println("Usage: java " + WikipediaPagePortalExtractor.class.getName() + " <xml-file> <base-dir>");
			System.exit(1);
		}
		String xmlFileName = args[0];
		String baseDir = args[1];

		String configurationFolder = "configuration/";
		String logConfig = System.getProperty("log-config");
		if (logConfig == null) {
			logConfig = configurationFolder + "log-config.txt";
		}

		PropertyConfigurator.configure(logConfig);

		ExtractorParameters extractorParameters = new ExtractorParameters(xmlFileName, baseDir, true);
		WikipediaPagePortalExtractor w = new WikipediaPagePortalExtractor(12, Integer.MAX_VALUE, extractorParameters.getLocale());
		w.start(extractorParameters);
	}

}




// © 2015 - 2025 Weber Informatics LLC | Privacy Policy