de.julielab.genemapper.resources.uima.WikipediaReader Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of gene-mapper-resources Show documentation
Show all versions of gene-mapper-resources Show documentation
This project assembles code and files required to build the dictionaries and indexes used by the JCoRe
Gene Mapper.
The newest version!
package de.julielab.genemapper.resources.uima;
import de.julielab.genemapper.WikipediaCategoryManager;
import de.julielab.genemapper.resources.MultiStreamBZip2InputStream;
import de.julielab.java.utilities.FileUtilities;
import de.julielab.jcore.types.Header;
import de.julielab.jcore.types.wikipedia.Title;
import org.apache.commons.io.LineIterator;
import org.apache.uima.UimaContext;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.component.JCasCollectionReader_ImplBase;
import org.apache.uima.fit.descriptor.ConfigurationParameter;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Progress;
import org.apache.uima.util.ProgressImpl;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import javax.annotation.Nullable;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamReader;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.function.Predicate;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
 * Collection reader that walks through a Wikipedia XML dump with a StAX parser and emits one CAS
 * per page. The document text of each CAS is an excerpt of the first line of the page's wiki text
 * that looks like running text, with wiki and XML markup removed.
 * <p>
 * Only pages in namespace {@code 0} are considered. Pages can additionally be restricted by a title
 * whitelist and by a category tree: when a category tree file is configured, pages without a path
 * to {@code Category:Biology} are dropped. The category manager is held in a static field and is
 * therefore shared by all reader instances in the JVM; its creation is guarded by the class lock.
 */
@ResourceMetaData(name = "JCoRe GeneMapper Wikipedia Reader", description = "Reads the XML Wikipedia dump. Extracts a portion of the first text line of each page to capture the term definition.")
public class WikipediaReader extends JCasCollectionReader_ImplBase {
    public static final String PARAM_WIKIPEDIA_XML = "WikipediaXML";
    public static final String PARAM_EXCERPT_LENGTH = "ExcerptLength";
    public static final String PARAM_TITLE_WHITELIST = "TitleWhitelist";
    public static final String PARAM_WIKIPEDIA_CATEGORY_TREE_PATH = "WikipediaCategoryTreePath";
    private static final Logger log = LoggerFactory.getLogger(WikipediaReader.class);
    /**
     * Matches piped wiki links {@code [[target|label]]}; group 1 is the label.
     */
    private static final Pattern WIKI_LINK_PATTERN = Pattern.compile("\\[\\[[^]|]+\\|([^]]+)\\]\\]");
    /**
     * Matches runs of the characters {@code ] [ { } '} that remain from wiki markup.
     */
    private static final Pattern WIKI_MARKUP_ELEMENTS = Pattern.compile("[]\\[{}']+");
    /**
     * Removes {@code <ref ...>...</ref>} elements from the text, i.e. bibliographical references,
     * including their content. Self-closing {@code <ref ... />} tags are deliberately not matched
     * as an opening tag (the look-behind rejects a {@code /} right before the {@code >}); they are
     * removed by {@link #XML_MARKUP_ELEMENTS} afterwards. The word boundary keeps
     * {@code <references>} from being treated as a {@code <ref>}.
     */
    private static final Pattern XML_REF_ELEMENT_PATTERN = Pattern.compile("<ref\\b[^>]*(?<!/)>.*?</ref>", Pattern.CASE_INSENSITIVE);
    /**
     * Matches any remaining XML/HTML tag.
     */
    private static final Pattern XML_MARKUP_ELEMENTS = Pattern.compile("<[^>]+>");
    private static final Pattern NON_WS_PATTERN = Pattern.compile("[^\\s]");
    /**
     * A line whose first non-whitespace character is one of these is not used as excerpt source.
     */
    private static final Set<Character> NON_TEXT_CHARS = Set.of('{', '}', '#', '|', '<', '[', '*');
    private static WikipediaCategoryManager wikipediaCategoryManager;
    private final XMLInputFactory factory = XMLInputFactory.newInstance();
    @ConfigurationParameter(name = PARAM_WIKIPEDIA_XML)
    private String wikipediaXml;
    @ConfigurationParameter(name = PARAM_EXCERPT_LENGTH, description = "Maximum number of characters to be kept from the first line of each page. Defaults to 1000.", mandatory = false, defaultValue = "1000")
    private int excerptLength;
    @ConfigurationParameter(name = PARAM_TITLE_WHITELIST, description = "Path to a file. If given, only pages that have a title on the list will be returned as a CAS.", mandatory = false)
    private String titleWhiteListFilePath;
    @ConfigurationParameter(name = PARAM_WIKIPEDIA_CATEGORY_TREE_PATH, mandatory = false, description = "Optional. File created by GeNo's 'WikipediaCategoryTreeAndRedirectsExtractor' class that represents a map from page and category titles to categories they belong to. Will be used to filter for pages that are in some way related to the Molecular Biology category.")
    private String wikipediaCategoryTreePath;
    private Set<String> titleWhitelist;
    private XMLStreamReader parser;
    /**
     * The outermost stream/reader opened on the dump so far; released in {@link #close()}.
     */
    private Closeable dumpSource;
    private ParsingStatus currentPage;
    private int processedPages;

    /**
     * Reads the configuration, builds the shared category manager if requested, loads the optional
     * title whitelist, opens the dump and pre-fetches the first page.
     *
     * @param context the UIMA context carrying the configuration parameters
     * @throws ResourceInitializationException if the whitelist or the dump cannot be read or parsed
     */
    @Override
    public void initialize(UimaContext context) throws ResourceInitializationException {
        super.initialize(context);
        this.wikipediaXml = (String) context.getConfigParameterValue(PARAM_WIKIPEDIA_XML);
        this.excerptLength = Optional.ofNullable((Integer) context.getConfigParameterValue(PARAM_EXCERPT_LENGTH)).orElse(1000);
        this.titleWhiteListFilePath = (String) context.getConfigParameterValue(PARAM_TITLE_WHITELIST);
        this.wikipediaCategoryTreePath = (String) context.getConfigParameterValue(PARAM_WIKIPEDIA_CATEGORY_TREE_PATH);
        log.info("Reading Wikipedia dump from {}.", wikipediaXml);
        log.info("Maximum excerpt length: {}", excerptLength);
        // The category manager must exist BEFORE the first page is fetched below; otherwise that
        // page would slip past the category filter in getNextPage().
        synchronized (WikipediaReader.class) {
            if (wikipediaCategoryTreePath != null && wikipediaCategoryManager == null) {
                log.info("Creating Dijkstra tree for Category:Biology");
                // Publish the manager only once its tree has been built.
                WikipediaCategoryManager manager = new WikipediaCategoryManager(wikipediaCategoryTreePath, true);
                manager.buildDijkstraTree("Category:Biology");
                wikipediaCategoryManager = manager;
            }
        }
        try {
            if (titleWhiteListFilePath != null) {
                try (BufferedReader br = FileUtilities.getReaderFromFile(new File(titleWhiteListFilePath))) {
                    // Lines starting with '#' are comments; blank lines are ignored.
                    titleWhitelist = br.lines().filter(line -> !line.startsWith("#") && !line.isBlank()).collect(Collectors.toSet());
                    log.info("Received Wikipedia title whitelist from {} with {} entries.", titleWhiteListFilePath, titleWhitelist.size());
                }
            }
            openDump();
            currentPage = getNextPage();
            processedPages = 0;
        } catch (IOException | XMLStreamException e) {
            log.error("Exception while initializing WikipediaReader", e);
            // Do not leave the dump file open when initialization fails half-way.
            try {
                close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * Opens the dump file and creates the StAX parser on top of it. Files whose name ends with
     * {@code .bz2} are wrapped in a {@link MultiStreamBZip2InputStream}. The bytes are decoded as
     * UTF-8 explicitly instead of relying on the platform default charset.
     */
    private void openDump() throws IOException, XMLStreamException {
        // The dump needs neither DTDs nor external entities; switch both off (XXE hardening).
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, false);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, false);
        FileInputStream fin = new FileInputStream(wikipediaXml);
        // Remember the raw stream right away so close() can release it even if wrapping fails.
        dumpSource = fin;
        BufferedInputStream bis = new BufferedInputStream(fin);
        InputStream in = wikipediaXml.endsWith(".bz2") ? new MultiStreamBZip2InputStream(bis) : bis;
        BufferedReader br = new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8));
        dumpSource = br;
        parser = factory.createXMLStreamReader(br);
    }

    /**
     * Fills the CAS with the pre-fetched page and then fetches the following one.
     * <p>
     * The document text is the page excerpt. A {@link Title} annotation is added on the first
     * case-insensitive occurrence of the page title in the excerpt; if the title is not found and
     * ends with an "s", the title without that last character is tried. The {@link Header} carries
     * the page ID and title; it spans the title mention if there is one, otherwise the text up to
     * the first space, and has no offsets if neither is available.
     *
     * @param jCas the CAS to populate; left untouched if there is no current page
     * @throws CollectionException if populating the CAS or parsing the next page fails
     */
    @Override
    public void getNext(JCas jCas) throws CollectionException {
        if (currentPage == null) {
            return;
        }
        try {
            String text = currentPage.getText();
            String pageTitle = currentPage.getTitle();
            jCas.setDocumentText(text);

            // Search in the original text (not in a lower-cased copy) so that the offsets are
            // guaranteed to be valid for the document text.
            int titleLength = pageTitle == null ? 0 : pageTitle.length();
            int titleBegin = pageTitle == null ? -1 : indexOfIgnoreCase(text, pageTitle);
            if (titleBegin < 0 && titleLength > 0 && Character.toLowerCase(pageTitle.charAt(titleLength - 1)) == 's') {
                titleLength--;
                titleBegin = indexOfIgnoreCase(text, pageTitle.substring(0, titleLength));
            }
            Title title = null;
            if (titleBegin >= 0) {
                title = new Title(jCas, titleBegin, titleBegin + titleLength);
                title.addToIndexes();
            }

            int headerBegin = title != null ? title.getBegin() : 0;
            int headerEnd = title != null ? title.getEnd() : text.indexOf(' ');
            Header header = headerBegin >= 0 && headerEnd > headerBegin ? new Header(jCas, headerBegin, headerEnd) : new Header(jCas);
            header.setDocId(currentPage.getPageId());
            header.setTitle(currentPage.getTitle());
            header.addToIndexes();

            currentPage = getNextPage();
            ++processedPages;
            if (processedPages % 100000 == 0) {
                log.info("Processed {} pages.", processedPages);
            }
        } catch (XMLStreamException | RuntimeException e) {
            log.error("Error while reading Wikipedia", e);
            throw new CollectionException(e);
        }
    }

    /**
     * Returns the index of the first occurrence of {@code needle} in {@code text}, ignoring case,
     * or {@code -1} if there is none. An empty needle is found at index 0.
     */
    private static int indexOfIgnoreCase(String text, String needle) {
        int lastStart = text.length() - needle.length();
        for (int start = 0; start <= lastStart; start++) {
            if (text.regionMatches(true, start, needle, 0, needle.length())) {
                return start;
            }
        }
        return -1;
    }

    /**
     * Advances the parser to the end of the next page that should be emitted.
     * <p>
     * Pages are dropped when they were marked to be skipped by the category filter or when no
     * non-blank excerpt was extracted for them (e.g. wrong namespace, not on the whitelist).
     *
     * @return the next page to emit or {@code null} when the dump is exhausted
     * @throws XMLStreamException if the dump is not well-formed or cannot be read
     */
    @Nullable
    private ParsingStatus getNextPage() throws XMLStreamException {
        ParsingStatus page = null;
        while (parser.hasNext()) {
            int eventType = parser.next();
            if (eventType == XMLStreamReader.START_ELEMENT) {
                String element = parser.getLocalName();
                if (element.equalsIgnoreCase("page")) {
                    page = new ParsingStatus();
                } else if (page != null && !page.isSkip()) {
                    readPageElement(element, page);
                }
            } else if (eventType == XMLStreamReader.END_ELEMENT && parser.getLocalName().equalsIgnoreCase("page")) {
                if (page != null && !page.isSkip() && page.getText() != null && !page.getText().isBlank()) {
                    return page;
                }
                // Nothing usable in this page, continue with the next one.
                page = null;
            }
        }
        return null;
    }

    /**
     * Handles the start of a child element of the page currently being parsed and stores the
     * element's content in {@code page} if it is one of {@code title}, {@code ns}, {@code text} or
     * the first {@code id}.
     *
     * @param element local name of the element the parser is positioned on
     * @param page    the page under construction
     */
    private void readPageElement(String element, ParsingStatus page) throws XMLStreamException {
        if (element.equalsIgnoreCase("title")) {
            String pageTitle = parser.getElementText();
            if (wikipediaCategoryManager != null) {
                List<?> path = wikipediaCategoryManager.getShortestPathToDijkstraTreeRoot(pageTitle, null);
                // No path to the root category means the page is unrelated to it.
                if (path.isEmpty()) {
                    page.skip();
                }
            }
            page.setTitle(pageTitle);
        } else if (element.equalsIgnoreCase("ns")) {
            page.setNamespace(parser.getElementText());
        } else if (element.equalsIgnoreCase("text") && "0".equals(page.getNamespace())) {
            // Constant-first equals: a page without <ns> is ignored instead of causing an NPE.
            if (titleWhitelist == null || titleWhitelist.isEmpty() || titleWhitelist.contains(page.getTitle())) {
                parseText(parser.getElementText(), page);
            }
        } else if (page.getPageId() == null && element.equalsIgnoreCase("id")) {
            // Only the first <id> within a page is kept; later <id> elements are ignored.
            page.setPageId(parser.getElementText());
        }
    }

    /**
     * Finds the first line of the wiki text that starts with plain text, strips its markup and
     * stores the result, cut to the configured excerpt length, as the text of {@code ps}. If there
     * is no such line, {@code ps} is left unchanged.
     *
     * @param elementText the wiki markup of the page
     * @param ps          the page that receives the excerpt
     */
    private void parseText(String elementText, ParsingStatus ps) {
        elementText.lines()
                .filter(WikipediaReader::startsWithPlainText)
                .findFirst()
                .map(line -> toExcerpt(line, excerptLength))
                .ifPresent(ps::setText);
    }

    /**
     * Tells whether the line is non-blank and its first non-whitespace character is not one of
     * {@link #NON_TEXT_CHARS}.
     */
    private static boolean startsWithPlainText(String line) {
        if (line.isBlank()) {
            return false;
        }
        Matcher firstVisible = NON_WS_PATTERN.matcher(line);
        return firstVisible.find() && !NON_TEXT_CHARS.contains(line.charAt(firstVisible.start()));
    }

    /**
     * Removes markup from the line and cuts the result to at most {@code maxLength} characters.
     * Order matters: piped links are reduced to their label first, then stray wiki markup
     * characters are dropped, then whole reference elements, and finally any remaining tags.
     */
    private static String toExcerpt(String line, int maxLength) {
        String stripped = WIKI_LINK_PATTERN.matcher(line).replaceAll("$1");
        stripped = WIKI_MARKUP_ELEMENTS.matcher(stripped).replaceAll("");
        stripped = XML_REF_ELEMENT_PATTERN.matcher(stripped).replaceAll("");
        stripped = XML_MARKUP_ELEMENTS.matcher(stripped).replaceAll("");
        int end = Math.max(0, Math.min(stripped.length(), maxLength));
        return stripped.substring(0, end);
    }

    /**
     * @return {@code true} as long as a pre-fetched page is waiting to be emitted
     */
    @Override
    public boolean hasNext() throws IOException, CollectionException {
        return currentPage != null;
    }

    /**
     * @return a single progress entry with the number of pages emitted so far; the total is passed
     * as 0
     */
    @Override
    public Progress[] getProgress() {
        return new Progress[]{new ProgressImpl(processedPages, 0, "pages")};
    }

    /**
     * Closes the XML parser and the underlying dump file. After this call the reader reports no
     * further pages. Closing the parser alone is not sufficient because
     * {@link XMLStreamReader#close()} does not close the input source.
     *
     * @throws IOException if the dump file cannot be closed
     */
    @Override
    public void close() throws IOException {
        currentPage = null;
        try {
            if (parser != null) {
                parser.close();
            }
        } catch (XMLStreamException e) {
            log.warn("Could not close the XML parser for {}", wikipediaXml, e);
        } finally {
            parser = null;
            if (dumpSource != null) {
                Closeable source = dumpSource;
                dumpSource = null;
                source.close();
            }
        }
    }

    /**
     * Mutable holder for the data collected about one page while it is being parsed.
     */
    private static final class ParsingStatus {
        private String title;
        private String text;
        private String namespace;
        private String pageId;
        private boolean skip;

        public boolean isSkip() {
            return skip;
        }

        public String getText() {
            return text;
        }

        public void setText(String text) {
            this.text = text;
        }

        public String getTitle() {
            return title;
        }

        public void setTitle(String title) {
            this.title = title;
        }

        public String getNamespace() {
            return namespace;
        }

        public void setNamespace(String namespace) {
            this.namespace = namespace;
        }

        public String getPageId() {
            return pageId;
        }

        public void setPageId(String pageId) {
            this.pageId = pageId;
        }

        /**
         * Marks the page so that its remaining elements are ignored and it is not emitted.
         */
        public void skip() {
            this.skip = true;
        }
    }
}