All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource Maven / Gradle / Ivy

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;


import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.ThreadInterruptedException;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * A {@link ContentSource} which reads the English Wikipedia dump. You can read
 * the .bz2 file directly (it will be decompressed on the fly). Config
 * properties:
 * 
    *
  • keep.image.only.docs=false|true (default true). *
  • docs.file=<path to the file> *
*/ public class EnwikiContentSource extends ContentSource { private class Parser extends DefaultHandler implements Runnable { private Thread t; private boolean threadDone; private boolean stopped = false; private String[] tuple; private NoMoreDataException nmde; private StringBuilder contents = new StringBuilder(); private String title; private String body; private String time; private String id; String[] next() throws NoMoreDataException { if (t == null) { threadDone = false; t = new Thread(this); t.setDaemon(true); t.start(); } String[] result; synchronized(this){ while(tuple == null && nmde == null && !threadDone && !stopped) { try { wait(); } catch (InterruptedException ie) { throw new ThreadInterruptedException(ie); } } if (tuple != null) { result = tuple; tuple = null; notify(); return result; } if (nmde != null) { // Set to null so we will re-start thread in case // we are re-used: t = null; throw nmde; } // The thread has exited yet did not hit end of // data, so this means it hit an exception. 
We // throw NoMorDataException here to force // benchmark to stop the current alg: throw new NoMoreDataException(); } } String time(String original) { StringBuilder buffer = new StringBuilder(); buffer.append(original.substring(8, 10)); buffer.append('-'); buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]); buffer.append('-'); buffer.append(original.substring(0, 4)); buffer.append(' '); buffer.append(original.substring(11, 19)); buffer.append(".000"); return buffer.toString(); } @Override public void characters(char[] ch, int start, int length) { contents.append(ch, start, length); } @Override public void endElement(String namespace, String simple, String qualified) throws SAXException { int elemType = getElementType(qualified); switch (elemType) { case PAGE: // the body must be null and we either are keeping image docs or the // title does not start with Image: if (body != null && (keepImages || !title.startsWith("Image:"))) { String[] tmpTuple = new String[LENGTH]; tmpTuple[TITLE] = title.replace('\t', ' '); tmpTuple[DATE] = time.replace('\t', ' '); tmpTuple[BODY] = body.replaceAll("[\t\n]", " "); tmpTuple[ID] = id; synchronized(this) { while (tuple != null && !stopped) { try { wait(); } catch (InterruptedException ie) { throw new ThreadInterruptedException(ie); } } tuple = tmpTuple; notify(); } } break; case BODY: body = contents.toString(); //workaround that startswith doesn't have an ignore case option, get at least 20 chars. String startsWith = body.substring(0, Math.min(10, contents.length())).toLowerCase(Locale.ROOT); if (startsWith.startsWith("#redirect")) { body = null; } break; case DATE: time = time(contents.toString()); break; case TITLE: title = contents.toString(); break; case ID: //the doc id is the first one in the page. All other ids after that one can be ignored according to the schema if (id == null) { id = contents.toString(); } break; default: // this element should be discarded. 
} } @Override public void run() { try { XMLReader reader = XMLReaderFactory.createXMLReader(); reader.setContentHandler(this); reader.setErrorHandler(this); while(!stopped){ final InputStream localFileIS = is; if (localFileIS != null) { // null means fileIS was closed on us try { // To work around a bug in XERCES (XERCESJ-1257), we assume the XML is always UTF8, so we simply provide reader. reader.parse(new InputSource(IOUtils.getDecodingReader(localFileIS, StandardCharsets.UTF_8))); } catch (IOException ioe) { synchronized(EnwikiContentSource.this) { if (localFileIS != is) { // fileIS was closed on us, so, just fall through } else // Exception is real throw ioe; } } } synchronized(this) { if (stopped || !forever) { nmde = new NoMoreDataException(); notify(); return; } else if (localFileIS == is) { // If file is not already re-opened then re-open it now is = openInputStream(); } } } } catch (SAXException | IOException sae) { throw new RuntimeException(sae); } finally { synchronized(this) { threadDone = true; notify(); } } } @Override public void startElement(String namespace, String simple, String qualified, Attributes attributes) { int elemType = getElementType(qualified); switch (elemType) { case PAGE: title = null; body = null; time = null; id = null; break; // intentional fall-through. case BODY: case DATE: case TITLE: case ID: contents.setLength(0); break; default: // this element should be discarded. } } private void stop() { synchronized (this) { stopped = true; if (tuple != null) { tuple = null; notify(); } } } } private static final Map ELEMENTS = new HashMap<>(); private static final int TITLE = 0; private static final int DATE = TITLE + 1; private static final int BODY = DATE + 1; private static final int ID = BODY + 1; private static final int LENGTH = ID + 1; // LENGTH is used as the size of the tuple, so whatever constants we need that // should not be part of the tuple, we should define them after LENGTH. 
private static final int PAGE = LENGTH + 1; private static final String[] months = {"JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"}; static { ELEMENTS.put("page", Integer.valueOf(PAGE)); ELEMENTS.put("text", Integer.valueOf(BODY)); ELEMENTS.put("timestamp", Integer.valueOf(DATE)); ELEMENTS.put("title", Integer.valueOf(TITLE)); ELEMENTS.put("id", Integer.valueOf(ID)); } /** * Returns the type of the element if defined, otherwise returns -1. This * method is useful in startElement and endElement, by not needing to compare * the element qualified name over and over. */ private final static int getElementType(String elem) { Integer val = ELEMENTS.get(elem); return val == null ? -1 : val.intValue(); } private Path file; private boolean keepImages = true; private InputStream is; private Parser parser = new Parser(); @Override public void close() throws IOException { synchronized (EnwikiContentSource.this) { parser.stop(); if (is != null) { is.close(); is = null; } } } @Override public synchronized DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { String[] tuple = parser.next(); docData.clear(); docData.setName(tuple[ID]); docData.setBody(tuple[BODY]); docData.setDate(tuple[DATE]); docData.setTitle(tuple[TITLE]); return docData; } @Override public void resetInputs() throws IOException { super.resetInputs(); is = openInputStream(); } /** Open the input stream. */ protected InputStream openInputStream() throws IOException { return StreamUtils.inputStream(file); } @Override public void setConfig(Config config) { super.setConfig(config); keepImages = config.get("keep.image.only.docs", true); String fileName = config.get("docs.file", null); if (fileName != null) { file = Paths.get(fileName).toAbsolutePath(); } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy