All downloads are free. Search and download functionality uses the official Maven repository.

de.datexis.retrieval.preprocess.WikipediaUrlPreprocessor Maven / Gradle / Ivy

package de.datexis.retrieval.preprocess;

import org.apache.commons.lang.StringEscapeUtils;
import org.deeplearning4j.text.tokenization.tokenizer.TokenPreProcess;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.regex.Pattern;

/**
 * Token pre-processor that turns a Wikipedia URL or raw page title into a
 * normalized page title (URL prefix and anchor removed, percent escapes and
 * HTML entities decoded, spaces written as underscores).
 *
 * @author Sebastian Arnold 
 */
public class WikipediaUrlPreprocessor implements TokenPreProcess {
  
  /** Everything up to and including the last "/wiki/" path segment. */
  private static final Pattern URL_PREFIX = Pattern.compile("^.+\\/wiki\\/");
  
  /** A '#' fragment, including an empty one, up to the end of the input. */
  private static final Pattern ANCHOR = Pattern.compile("#.*$");
  
  /** A '%' that does not start a two-digit hex escape (either letter case). */
  private static final Pattern BARE_PERCENT = Pattern.compile("%(?![0-9A-Fa-f]{2})");
  
  protected final Logger log = LoggerFactory.getLogger(getClass());
  
  public WikipediaUrlPreprocessor() {
  }
  
  /**
   * Normalizes the given token by delegating to {@link #cleanWikiPageTitle(String)}.
   *
   * @param pageId a Wikipedia URL or page title
   * @return the cleaned page title
   */
  @Override
  public String preProcess(String pageId) {
    return cleanWikiPageTitle(pageId);
  }
  
  /**
   * Clean a given URL or page title to get a valid Wiki page title
   * see https://en.wikipedia.org/wiki/Wikipedia:Page_name#Technical_restrictions_and_limitations
   *
   * <p>Steps, in order: strip everything up to the last "/wiki/", strip a trailing
   * "#fragment", decode percent escapes as UTF-8, unescape HTML entities, trim
   * surrounding whitespace and replace the remaining spaces with underscores.
   *
   * @param pageTitle a Wikipedia URL or page title; must not be {@code null}
   * @return the cleaned page title with underscores instead of spaces
   */
  public static String cleanWikiPageTitle(String pageTitle) {
    // remove host path and anchors
    String title = URL_PREFIX.matcher(pageTitle).replaceFirst("");
    title = ANCHOR.matcher(title).replaceFirst("");
    
    String decoded = title;
    try {
      // A '%' that is not a valid escape must survive decoding as a literal '%'.
      String escaped = BARE_PERCENT.matcher(title).replaceAll("%25");
      // URLDecoder applies form decoding ('+' -> ' '); in a page title '+' is literal
      // (e.g. "C++"), so protect it before decoding.
      escaped = escaped.replace("+", "%2B");
      decoded = URLDecoder.decode(escaped, "UTF-8"); // replace escaped chars
    } catch (UnsupportedEncodingException | IllegalArgumentException ignored) {
      // Best effort: not expected after the sanitizing above; if decoding still fails,
      // continue with the undecoded title instead of failing the whole pipeline.
    }
    
    String unescaped = StringEscapeUtils.unescapeHtml(decoded);
    // Trim BEFORE replacing spaces, otherwise surrounding spaces turn into underscores.
    return unescaped.trim().replace(' ', '_');
  }
  
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy