// org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource Maven / Gradle / Ivy
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.benchmark.byTask.feeds;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
/**
 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
 * the .bz2 file directly (it will be decompressed on the fly). Config
 * properties:
 * <ul>
 * <li>keep.image.only.docs=false|true (default <b>true</b>).
 * <li>docs.file=&lt;path to the file&gt;
 * </ul>
 */
public class EnwikiContentSource extends ContentSource {
private class Parser extends DefaultHandler implements Runnable {
// Thread that runs the SAX parse; created lazily by next() and set back to null there once
// the end of data has been reported.
private Thread t;
// Set to true in the finally block of run(), i.e. whenever the parser thread exits.
private boolean threadDone;
// Set to true by stop(); checked by the wait loops in next() and endElement() and by the
// parse loop in run().
private boolean stopped = false;
// Single-slot hand-off: endElement() stores a finished page here, next() takes it and
// resets the slot to null.
private String[] tuple;
// Created by run() when parsing ends because the parser was stopped or "forever" is off;
// rethrown by next().
private NoMoreDataException nmde;
// Character data collected by characters(); cleared by startElement() for the elements of
// interest.
private StringBuilder contents = new StringBuilder();
// Values captured for the page currently being parsed; all four are reset to null when a
// new page element starts.
private String title;
private String body;
private String time;
private String id;
/**
 * Returns the next parsed page as a tuple indexed by TITLE, DATE, BODY and ID, starting the
 * parser thread first if none is running.
 *
 * <p>Blocks until the parser thread publishes a tuple, reports the end of data, exits, or
 * the parser is stopped.
 *
 * @return the published tuple; the hand-off slot is cleared and one waiting thread notified
 * @throws NoMoreDataException if the parser reported the end of data, or if the wait ended
 *     without a tuple because the thread exited or the parser was stopped
 * @throws ThreadInterruptedException if the calling thread is interrupted while waiting
 */
String[] next() throws NoMoreDataException {
// Lazily start the parser thread as a daemon.
if (t == null) {
threadDone = false;
t = new Thread(this);
t.setDaemon(true);
t.start();
}
String[] result;
synchronized(this){
// Wait until there is something to act on: a tuple, an end-of-data marker, a finished
// thread, or a stop request.
while(tuple == null && nmde == null && !threadDone && !stopped) {
try {
wait();
} catch (InterruptedException ie) {
throw new ThreadInterruptedException(ie);
}
}
if (tuple != null) {
// Take the tuple and free the slot so the parser thread can publish the next page.
result = tuple;
tuple = null;
notify();
return result;
}
if (nmde != null) {
// Set to null so we will re-start thread in case
// we are re-used:
// NOTE(review): nmde itself is not cleared here, so a later call still sees it and
// skips the wait loop above - confirm re-use after end of data behaves as intended.
t = null;
throw nmde;
}
// The thread has exited (or the parser was stopped) yet did not
// hit end of data; for an exited thread this means it hit an
// exception. We throw NoMoreDataException here to force
// benchmark to stop the current alg:
throw new NoMoreDataException();
}
}
/**
 * Rearranges a timestamp whose characters 0-4 hold the year, 5-7 the month number, 8-10 the
 * day and 11-19 the time of day (e.g. {@code 2006-01-02T15:04:05Z}) into
 * {@code dd-MMM-yyyy HH:mm:ss.000}, where {@code MMM} is looked up in {@code months}.
 *
 * @param original the timestamp text, at least 19 characters long
 * @return the reformatted timestamp, always ending in {@code .000}
 */
String time(String original) {
  final String day = original.substring(8, 10);
  // Month numbers are 1-based in the input, the months table is 0-based.
  final String month = months[Integer.parseInt(original.substring(5, 7)) - 1];
  final String year = original.substring(0, 4);
  final String clock = original.substring(11, 19);
  return day + '-' + month + '-' + year + ' ' + clock + ".000";
}
/**
 * Appends the reported character data to {@code contents}. Nothing is filtered here: the
 * buffer is cleared in startElement() and read in endElement().
 */
@Override
public void characters(char[] ch, int start, int length) {
contents.append(ch, start, length);
}
/**
 * Handles a closing tag. For title, timestamp, id and text elements the characters collected
 * in {@code contents} are captured into the matching field; for a page element the finished
 * page is handed over to {@code next()} through the single-slot {@code tuple}.
 *
 * <p>Tabs in the title and date, and tabs and newlines in the body, are replaced by spaces
 * before the page is published.
 *
 * @param namespace unused
 * @param simple unused
 * @param qualified the qualified element name, mapped to an element type via
 *     {@code getElementType}
 * @throws ThreadInterruptedException if the parser thread is interrupted while waiting for
 *     the previous tuple to be consumed
 */
@Override
public void endElement(String namespace, String simple, String qualified)
    throws SAXException {
  int elemType = getElementType(qualified);
  switch (elemType) {
    case PAGE:
      // Publish the page only if it still has a body (redirect bodies are set to null in
      // the BODY case) and we either keep image docs or the title does not start with
      // "Image:".
      if (body != null && (keepImages || !title.startsWith("Image:"))) {
        String[] tmpTuple = new String[LENGTH];
        tmpTuple[TITLE] = title.replace('\t', ' ');
        tmpTuple[DATE] = time.replace('\t', ' ');
        // Plain char replacement gives the same result as the regex "[\t\n]" without
        // compiling a pattern for every page.
        tmpTuple[BODY] = body.replace('\t', ' ').replace('\n', ' ');
        tmpTuple[ID] = id;
        synchronized (this) {
          // Wait until the consumer has taken the previous tuple (or we were stopped).
          while (tuple != null && !stopped) {
            try {
              wait();
            } catch (InterruptedException ie) {
              throw new ThreadInterruptedException(ie);
            }
          }
          tuple = tmpTuple;
          notify();
        }
      }
      break;
    case BODY:
      body = contents.toString();
      // String.startsWith has no ignore-case option, so lower-case only a short prefix:
      // up to 10 chars, which is enough to hold "#redirect".
      String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT);
      if (startsWith.startsWith("#redirect")) {
        // Redirect pages are dropped: a null body keeps the PAGE case from publishing.
        body = null;
      }
      break;
    case DATE:
      time = time(contents.toString());
      break;
    case TITLE:
      title = contents.toString();
      break;
    case ID:
      // The doc id is the first id in the page. All other ids after that one can be
      // ignored according to the schema.
      if (id == null) {
        id = contents.toString();
      }
      break;
    default:
      // this element should be discarded.
  }
}
/**
 * Body of the parser thread. Parses the current input stream with a SAX reader that uses
 * this object as content and error handler. When a pass over the stream ends it either
 * records a NoMoreDataException and returns (parser stopped, or "forever" is off) or
 * re-opens the input and parses again.
 *
 * <p>A SAXException or IOException is rethrown wrapped in a RuntimeException. In every case
 * the finally block sets threadDone and notifies a waiting thread.
 */
@Override
public void run() {
try {
// NOTE(review): XMLReaderFactory is deprecated in recent JDKs - consider SAXParserFactory.
XMLReader reader = XMLReaderFactory.createXMLReader();
reader.setContentHandler(this);
reader.setErrorHandler(this);
while(!stopped){
// Snapshot the stream so we can later tell whether it was replaced or closed meanwhile.
final InputStream localFileIS = is;
if (localFileIS != null) { // null means fileIS was closed on us
try {
// To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader.
reader.parse(new InputSource(IOUtils.getDecodingReader(localFileIS, StandardCharsets.UTF_8)));
} catch (IOException ioe) {
// NOTE(review): this takes the EnwikiContentSource monitor, which getNextDocData()
// holds for the whole of parser.next() - confirm a real IOException cannot arrive
// while a consumer is blocked there.
synchronized(EnwikiContentSource.this) {
if (localFileIS != is) {
// fileIS was closed on us, so, just fall through
} else
// Exception is real
throw ioe;
}
}
}
synchronized(this) {
if (stopped || !forever) {
// End of data: publish the marker for next() and finish the thread.
nmde = new NoMoreDataException();
notify();
return;
} else if (localFileIS == is) {
// If file is not already re-opened then re-open it now
is = openInputStream();
}
}
}
} catch (SAXException | IOException sae) {
throw new RuntimeException(sae);
} finally {
// Always tell a waiting next() that this thread is gone.
synchronized(this) {
threadDone = true;
notify();
}
}
}
/**
 * Handles an opening tag. A page element resets the per-page fields; a text, timestamp,
 * title or id element clears the character buffer so that {@code endElement} sees only that
 * element's own text. Every other element is ignored.
 *
 * @param namespace unused
 * @param simple unused
 * @param qualified the qualified element name, mapped to an element type via
 *     {@code getElementType}
 * @param attributes unused
 */
@Override
public void startElement(String namespace, String simple, String qualified,
    Attributes attributes) {
  final int kind = getElementType(qualified);
  if (kind == PAGE) {
    // A new page begins: forget whatever the previous page left behind.
    title = null;
    body = null;
    time = null;
    id = null;
  } else if (kind == BODY || kind == DATE || kind == TITLE || kind == ID) {
    // All four captured elements share the same handling: start with an empty buffer.
    contents.setLength(0);
  }
  // Anything else is an element we do not care about.
}
/**
 * Marks the parser as stopped. If a tuple is pending in the hand-off slot it is discarded
 * and one waiting thread is notified; with an empty slot nobody is notified.
 */
private synchronized void stop() {
  stopped = true;
  if (tuple == null) {
    return;
  }
  tuple = null;
  notify();
}
}
// Maps the XML element names we care about to their element-type constant. The type
// parameters are required: with a raw Map, get() returns Object and getElementType() below
// would not compile.
private static final Map<String, Integer> ELEMENTS = new HashMap<>();
// Element types that double as indexes into the tuple handed out by the parser.
private static final int TITLE = 0;
private static final int DATE = TITLE + 1;
private static final int BODY = DATE + 1;
private static final int ID = BODY + 1;
private static final int LENGTH = ID + 1;
// LENGTH is used as the size of the tuple, so whatever constants we need that
// should not be part of the tuple, we should define them after LENGTH.
private static final int PAGE = LENGTH + 1;
// Month abbreviations indexed by month number minus one.
private static final String[] months = {"JAN", "FEB", "MAR", "APR",
    "MAY", "JUN", "JUL", "AUG",
    "SEP", "OCT", "NOV", "DEC"};
static {
  ELEMENTS.put("page", Integer.valueOf(PAGE));
  ELEMENTS.put("text", Integer.valueOf(BODY));
  ELEMENTS.put("timestamp", Integer.valueOf(DATE));
  ELEMENTS.put("title", Integer.valueOf(TITLE));
  ELEMENTS.put("id", Integer.valueOf(ID));
}
/**
 * Returns the type of the element if defined, otherwise returns -1. This
 * method is useful in startElement and endElement, by not needing to compare
 * the element qualified name over and over.
 *
 * @param elem the qualified element name
 * @return one of PAGE, BODY, DATE, TITLE or ID, or -1 for any other name
 */
private static int getElementType(String elem) {
  Integer val = ELEMENTS.get(elem);
  return val == null ? -1 : val.intValue();
}
// Absolute path of the dump file; set in setConfig() when "docs.file" is configured.
private Path file;
// Whether pages whose title starts with "Image:" are kept; read from "keep.image.only.docs".
private boolean keepImages = true;
// Current input stream: opened by resetInputs() or by the parser thread, set to null by close().
private InputStream is;
// The SAX handler / producer that parses the stream and hands pages to getNextDocData().
private Parser parser = new Parser();
/**
 * Stops the parser and, if an input stream is open, closes it and clears the reference.
 * Runs while holding this content source's monitor.
 *
 * @throws IOException if closing the stream fails; the reference is then left untouched
 */
@Override
public void close() throws IOException {
  synchronized (this) {
    parser.stop();
    if (is == null) {
      return;
    }
    is.close();
    is = null;
  }
}
/**
 * Fetches the next page from the parser and copies it into {@code docData}: the page id
 * becomes the name, followed by body, date and title. {@code docData} is cleared only after
 * a page was obtained.
 *
 * @param docData the instance to fill
 * @return the same {@code docData} instance
 * @throws NoMoreDataException if the parser has no further page to deliver
 */
@Override
public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException {
  final String[] page = parser.next();
  docData.clear();
  docData.setName(page[ID]);
  docData.setBody(page[BODY]);
  docData.setDate(page[DATE]);
  docData.setTitle(page[TITLE]);
  return docData;
}
/**
 * Resets the inherited state and then points {@code is} at a freshly opened input stream.
 * The stream previously referenced by {@code is}, if any, is not closed here.
 *
 * @throws IOException if the superclass reset or opening the stream fails
 */
@Override
public void resetInputs() throws IOException {
  super.resetInputs();
  final InputStream reopened = openInputStream();
  is = reopened;
}
/**
 * Opens the configured dump file for reading via {@code StreamUtils.inputStream}.
 *
 * @return a new input stream over {@code file}
 * @throws IOException if the stream cannot be opened
 */
protected InputStream openInputStream() throws IOException {
  final InputStream opened = StreamUtils.inputStream(file);
  return opened;
}
/**
 * Reads this source's settings: {@code keep.image.only.docs} (default true) and
 * {@code docs.file}. When {@code docs.file} is absent the current file is left unchanged;
 * otherwise it is stored as an absolute path.
 *
 * @param config the benchmark configuration
 */
@Override
public void setConfig(Config config) {
  super.setConfig(config);
  keepImages = config.get("keep.image.only.docs", true);
  final String docsFile = config.get("docs.file", null);
  if (docsFile == null) {
    return;
  }
  file = Paths.get(docsFile).toAbsolutePath();
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy