
// eu.fbk.twm.wiki.xmldump.util.clean.CleanWikipedia (Maven / Gradle / Ivy)
package eu.fbk.twm.wiki.xmldump.util.clean;
import eu.fbk.twm.wiki.xmldump.util.WikiTemplate;
import java.io.BufferedReader;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Static helpers that strip unwanted markup from Wikipedia wikitext:
 * selected HTML-like tags, image/file links and tables.
 *
 * <p>Created by aprosio on 2/7/13.
 */
public class CleanWikipedia {

    /**
     * Names of the tags whose whole {@code <tag>...</tag>} span is removed by
     * {@link #clean(String, String[], boolean, boolean, boolean)} when
     * {@code removeBadHTML} is {@code true}.
     */
    public static String[] tagsToBeRemoved = {"math", "timeline"};

    /**
     * Removes every {@code <tagName>...</tagName>} span (tags included) from the text.
     *
     * <p>Only the exact forms {@code <tagName>} and {@code </tagName>} are recognised;
     * opening tags carrying attributes are not matched. An opening tag without a
     * following closing tag is left in place, and a closing tag without a pending
     * opening tag is ignored. When several opening tags precede one closing tag,
     * the removed span starts at the last of them.
     *
     * @param text    the text to clean
     * @param tagName the tag name; it is inserted into a regular expression verbatim,
     *                so it must not contain regex metacharacters
     * @return the text with all matched spans removed
     */
    public static String removeTag(String text, String tagName) {
        //todo: check for nested tags
        Pattern tag = Pattern.compile("(</?" + tagName + ">)");
        Matcher m = tag.matcher(text);

        StringBuilder sb = new StringBuilder(text.length());
        // End of the last removed span; everything before it is already handled.
        int copiedUpTo = 0;
        // Start offset of the most recent unmatched opening tag, or -1 if none.
        int lastStart = -1;

        while (m.find()) {
            // In "</tag>" the character right after '<' is the slash.
            boolean isClosing = m.group(1).charAt(1) == '/';
            if (!isClosing) {
                lastStart = m.start();
            } else if (lastStart >= 0) {
                // Keep the text before the opening tag, skip up to the end of the closing tag.
                sb.append(text, copiedUpTo, lastStart);
                copiedUpTo = m.end();
                lastStart = -1;
            }
        }
        sb.append(text, copiedUpTo, text.length());
        return sb.toString();
    }

    /**
     * Cleans the text with removal of the tags in {@link #tagsToBeRemoved} enabled.
     *
     * @param text         the wikitext to clean
     * @param filePrefixes prefixes identifying image/file links (e.g. {@code "File:"})
     * @param removeImages whether links whose first part starts with one of the prefixes are removed
     * @param removeTables whether tables are removed
     * @return the cleaned text
     */
    public static String clean(String text, String[] filePrefixes, boolean removeImages, boolean removeTables) {
        return clean(text, filePrefixes, removeImages, removeTables, true);
    }

    /**
     * Cleans the text by optionally removing the tags in {@link #tagsToBeRemoved},
     * image/file links and tables, in that order.
     *
     * <p>NOTE(review): the offset bookkeeping below assumes the parsers return root
     * entries in ascending start order without overlaps, and that {@code getEnd()}
     * is an inclusive index (hence the {@code + 1}) - confirm against
     * {@code WikiLinkParser} / {@code WikiTableParser}.
     *
     * @param text          the wikitext to clean
     * @param filePrefixes  prefixes identifying image/file links; only read when
     *                      {@code removeImages} is {@code true}
     * @param removeImages  whether root links whose first part starts with one of the prefixes are removed
     * @param removeTables  whether root tables are removed
     * @param removeBadHTML whether the tags listed in {@link #tagsToBeRemoved} are removed first
     * @return the cleaned text
     */
    public static String clean(String text, String[] filePrefixes, boolean removeImages, boolean removeTables, boolean removeBadHTML) {
        if (removeBadHTML) {
            for (String tag : tagsToBeRemoved) {
                text = removeTag(text, tag);
            }
        }

        StringBuilder sb = new StringBuilder(text);

        if (removeImages) {
            ArrayList<WikiTemplate> listOfLinks = WikiLinkParser.getTemplates(sb.toString(), false);
            // Number of characters already deleted; shifts the parser's offsets onto the shrinking buffer.
            int offset = 0;
            for (WikiTemplate link : listOfLinks) {
                if (link.getPartsCount() > 0 && link.isRoot) {
                    for (String prefix : filePrefixes) {
                        if (link.getParts().get(0).startsWith(prefix)) {
                            int start = link.getStart();
                            int end = link.getEnd() + 1;
                            sb.delete(start - offset, end - offset);
                            offset += end - start;
                            break;
                        }
                    }
                }
            }
        }

        if (removeTables) {
            // Tables are located in the text as it stands after image removal.
            ArrayList<WikiTemplate> listOfTables = WikiTableParser.getTemplates(sb.toString(), false);
            int offset = 0;
            for (WikiTemplate table : listOfTables) {
                if (table.isRoot) {
                    int start = table.getStart();
                    int end = table.getEnd() + 1;
                    sb.delete(start - offset, end - offset);
                    offset += end - start;
                }
            }
        }

        return sb.toString();
    }

    /**
     * Reads the file named by the first argument, cleans it (images, tables and
     * bad HTML removed) and prints the result to standard output.
     *
     * @param args {@code args[0]} is the path of the file to clean
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.err.println("Usage: CleanWikipedia <file>");
            return;
        }
        String filename = args[0];

        StringBuilder strbuf = new StringBuilder();
        try {
            BufferedReader in = new BufferedReader(new FileReader(filename));
            try {
                String line;
                while ((line = in.readLine()) != null) {
                    strbuf.append(line).append("\n");
                }
            } finally {
                // Close the reader even when reading fails part-way through.
                in.close();
            }
        } catch (Exception e) {
            // Best effort: report the problem and continue with whatever was read.
            e.printStackTrace();
        }

        String[] prefixes = {"Image:", "File:", "Soubor:"};
        String text = clean(strbuf.toString(), prefixes, true, true);
        System.out.println(text);
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy