All downloads are free. Search and download functionality uses the official Maven repository.

eu.fbk.twm.wiki.xmldump.util.clean.CleanWikipedia Maven / Gradle / Ivy

package eu.fbk.twm.wiki.xmldump.util.clean;

import eu.fbk.twm.wiki.xmldump.util.WikiTemplate;

import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Static helpers that strip unwanted markup from the text of a wiki page:
 * selected HTML-like tags together with their content, file/image links and
 * tables.
 *
 * User: aprosio
 * Date: 2/7/13
 */
public class CleanWikipedia {

	/**
	 * Names of the HTML-like tags that are removed, together with their content, by
	 * {@link #clean(String, String[], boolean, boolean, boolean)} when
	 * {@code removeBadHTML} is {@code true}.
	 */
	public static String[] tagsToBeRemoved = {"math", "timeline"};

	/**
	 * Removes every span of {@code text} that goes from an opening {@code tagName}
	 * tag (attributes allowed) to the next closing {@code tagName} tag, both tags
	 * included.
	 *
	 * Tags are not matched as nested pairs: when several opening tags precede a
	 * closing tag, only the span starting at the last opening tag is removed. A
	 * closing tag without a preceding opening tag, or an opening tag that is never
	 * closed, is left untouched.
	 *
	 * @param text    the text to clean
	 * @param tagName the tag name, without angle brackets (e.g. {@code "math"})
	 * @return {@code text} without the matched spans
	 */
	public static String removeTag(String text, String tagName) {

		//todo: check for nested tags

		// The name is quoted so it is always matched literally; \b keeps "math"
		// from also matching longer tag names such as "mathml".
		Pattern tag = Pattern.compile("(</?" + Pattern.quote(tagName) + "\\b[^>]*>)");
		Matcher m = tag.matcher(text);

		StringBuilder kept = new StringBuilder(text.length());
		int copyFrom = 0;     // first character of text not yet copied or dropped
		int lastStart = -1;   // start of the most recent unmatched opening tag, or -1
		while (m.find()) {
			// A match always starts with '<'; a '/' right after it marks a closing tag.
			boolean isStart = m.group(1).charAt(1) != '/';
			if (isStart) {
				lastStart = m.start();
			} else if (lastStart >= 0) {
				// Keep what precedes the opening tag and skip up to the end of the closing tag.
				kept.append(text, copyFrom, lastStart);
				copyFrom = m.end();
				lastStart = -1;
			}
		}
		kept.append(text, copyFrom, text.length());

		return kept.toString();
	}

	/**
	 * Same as {@link #clean(String, String[], boolean, boolean, boolean)} with
	 * {@code removeBadHTML} set to {@code true}.
	 */
	public static String clean(String text, String[] filePrefixes, boolean removeImages, boolean removeTables) {
		return clean(text, filePrefixes, removeImages, removeTables, true);
	}

	/**
	 * Cleans the text of a wiki page. The steps run in this order, each one on the
	 * output of the previous one:
	 * removal of the tags listed in {@link #tagsToBeRemoved}, removal of the root
	 * links whose first part starts with one of {@code filePrefixes}, removal of
	 * the root tables.
	 *
	 * @param text          the text to clean
	 * @param filePrefixes  prefixes identifying file/image links (e.g. {@code "File:"});
	 *                      only read when {@code removeImages} is {@code true}
	 * @param removeImages  whether links matching {@code filePrefixes} are removed
	 * @param removeTables  whether tables are removed
	 * @param removeBadHTML whether the tags in {@link #tagsToBeRemoved} are removed
	 * @return the cleaned text
	 */
	public static String clean(String text, String[] filePrefixes, boolean removeImages, boolean removeTables, boolean removeBadHTML) {

		if (removeBadHTML) {
			for (String tag : tagsToBeRemoved) {
				text = removeTag(text, tag);
			}
		}

		StringBuilder sb = new StringBuilder(text);

		if (removeImages) {
			ArrayList<WikiTemplate> listOfLinks = WikiLinkParser.getTemplates(sb.toString(), false);
			// Positions refer to the text as it was when parsed, so every removal
			// is shifted by the number of characters already deleted.
			int offset = 0;
			for (WikiTemplate link : listOfLinks) {
				if (link.isRoot && hasFilePrefix(link, filePrefixes)) {
					offset = removeTemplate(sb, link, offset);
				}
			}
		}

		if (removeTables) {
			// Parsed again from the current content, so the offset restarts from zero.
			ArrayList<WikiTemplate> listOfTables = WikiTableParser.getTemplates(sb.toString(), false);
			int offset = 0;
			for (WikiTemplate table : listOfTables) {
				if (table.isRoot) {
					offset = removeTemplate(sb, table, offset);
				}
			}
		}

		return sb.toString();
	}

	/** Tells whether the first part of {@code link} starts with one of {@code filePrefixes}. */
	private static boolean hasFilePrefix(WikiTemplate link, String[] filePrefixes) {
		if (link.getPartsCount() <= 0) {
			return false;
		}
		for (String prefix : filePrefixes) {
			if (link.getParts().get(0).startsWith(prefix)) {
				return true;
			}
		}
		return false;
	}

	/**
	 * Deletes the span of {@code template} from {@code sb}, where {@code offset} is
	 * the number of characters already deleted before it, and returns the updated
	 * offset.
	 */
	private static int removeTemplate(StringBuilder sb, WikiTemplate template, int offset) {
		int start = template.getStart();
		// +1 because the end is used as an exclusive bound below
		// (presumably getEnd() is the index of the last character -- TODO confirm).
		int end = template.getEnd() + 1;
		sb.delete(start - offset, end - offset);
		return offset + (end - start);
	}

	/**
	 * Reads the file given as first argument, cleans it (images, tables and bad
	 * HTML removed) and prints the result on standard output.
	 *
	 * @param args {@code args[0]} is the path of the file to clean
	 */
	public static void main(String[] args) {
		if (args.length < 1) {
			System.err.println("Usage: CleanWikipedia <file>");
			return;
		}
		String filename = args[0];

		StringBuilder strbuf = new StringBuilder();

		// try-with-resources closes the reader even when reading fails.
		// NOTE(review): FileReader decodes with the platform default charset.
		try (BufferedReader in = new BufferedReader(new FileReader(filename))) {
			String line;
			while ((line = in.readLine()) != null) {
				strbuf.append(line).append("\n");
			}
		} catch (Exception e) {
			// Best effort, as before: report the problem and clean whatever was read.
			e.printStackTrace();
		}

		String text = strbuf.toString();
		String[] prefixes = {"Image:", "File:", "Soubor:"};
		text = clean(text, prefixes, true, true);

		System.out.println(text);
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy