// org.culturegraph.mf.mediawiki.analyzers.SimpleLinkExtractor (Maven / Gradle / Ivy)
/*
* Copyright 2013 Deutsche Nationalbibliothek
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.culturegraph.mf.mediawiki.analyzers;
import org.culturegraph.mf.framework.StreamReceiver;
import org.culturegraph.mf.framework.annotations.Description;
import org.culturegraph.mf.framework.annotations.In;
import org.culturegraph.mf.framework.annotations.Out;
import org.culturegraph.mf.framework.helpers.DefaultObjectPipe;
import org.culturegraph.mf.mediawiki.WikiTextParser.ParseLevel;
import org.culturegraph.mf.mediawiki.objects.WikiPage;
/**
 * Extracts the targets of double-bracket links ({@code [[target]]}) from the
 * wikitext of a {@link WikiPage} and emits each target as a literal named
 * {@code "ref"}.
 *
 * <p>The extraction is a plain left-to-right string scan; the wikitext is not
 * parsed. For each {@code [[...]]} span only the part before the first
 * {@code '|'} (anchor text) and before the first {@code '#'} (section
 * fragment) is kept, and surrounding whitespace is removed. Links whose
 * remaining target is empty are skipped. Links containing a {@code ':'}
 * are not filtered out.
 *
 * <p>No record is started or ended by this class; only literals are sent to
 * the receiver.
 *
 * @author Markus Michael Geipel
 */
@Description("Very simple but efficient link extraction for wikitext.")
@In(WikiPage.class)
@Out(StreamReceiver.class)
public final class SimpleLinkExtractor
        extends DefaultObjectPipe<WikiPage, StreamReceiver>
        implements Analyzer {

    /** Character sequence that opens a link. */
    private static final String LINK_OPEN = "[[";

    /** Character sequence that closes a link. */
    private static final String LINK_CLOSE = "]]";

    /**
     * Always returns {@code true}.
     *
     * <p>NOTE(review): presumably tells the caller that this analyzer works on
     * the raw wikitext only; see {@code Analyzer} for the exact contract.
     *
     * @return {@code true}
     */
    @Override
    public boolean wikiTextOnly() {
        return true;
    }

    /**
     * Returns {@code null}, i.e. no parse level is named by this analyzer.
     *
     * @return always {@code null}
     */
    @Override
    public ParseLevel requiredParseLevel() {
        return null;
    }

    /**
     * Scans the wikitext of {@code page} for {@code [[...]]} links and sends
     * one {@code "ref"} literal per non-empty link target to the receiver.
     *
     * <p>Scanning stops at the first {@code "[["} that has no closing
     * {@code "]]"} after it. A page whose wikitext is {@code null} produces
     * no output.
     *
     * @param page the page whose wikitext is scanned
     */
    @Override
    public void process(final WikiPage page) {
        final String wikiText = page.getWikiText();
        if (wikiText == null) {
            // nothing to scan
            return;
        }

        int start = wikiText.indexOf(LINK_OPEN);
        while (start >= 0) {
            final int end = wikiText.indexOf(LINK_CLOSE, start);
            if (end < 0) {
                // unterminated link: give up on the rest of the text
                break;
            }

            final String target = extractTarget(
                    wikiText.substring(start + LINK_OPEN.length(), end));
            // Empty targets come from "[[]]", article-internal links such as
            // "[[#section|here]]", or whitespace-only targets such as "[[ ]]".
            if (!target.isEmpty()) {
                getReceiver().literal("ref", target);
            }

            // "[[" cannot begin inside the closing "]]", so continue after it
            start = wikiText.indexOf(LINK_OPEN, end + LINK_CLOSE.length());
        }
    }

    /**
     * Reduces the text between {@code "[["} and {@code "]]"} to the link target:
     * drops everything from the first {@code '|'} on, then everything from
     * the first {@code '#'} on, then trims surrounding whitespace.
     *
     * @param linkBody the text found between the link delimiters
     * @return the trimmed target, possibly empty
     */
    private static String extractTarget(final String linkBody) {
        String target = linkBody;

        final int pipe = target.indexOf('|');
        if (pipe >= 0) {
            target = target.substring(0, pipe);
        }

        final int hash = target.indexOf('#');
        if (hash >= 0) {
            target = target.substring(0, hash);
        }

        return target.trim();
    }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy