All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.apache.lucene.benchmark.byTask.feeds.LineDocSource Maven / Gradle / Ivy

The newest version!
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.benchmark.byTask.feeds;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.lang.reflect.Constructor;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.Properties;
import org.apache.lucene.benchmark.byTask.tasks.WriteLineDocTask;
import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.benchmark.byTask.utils.StreamUtils;
import org.apache.lucene.util.IOUtils;

/**
 * A {@link ContentSource} reading one line at a time as a {@link
 * org.apache.lucene.document.Document} from a single file. This saves IO cost (over
 * DirContentSource) of recursing through a directory and opening a new file for every document.
* The expected format of each line is (arguments are separated by <TAB>): title, date, * body. If a line is read in a different format, a {@link RuntimeException} will be thrown. In * general, you should use this content source for files that were created with {@link * WriteLineDocTask}.
*
* Config properties: * *
    *
  • docs.file=<path to the file> *
  • content.source.encoding - default to UTF-8. *
  • line.parser - default to {@link HeaderLineParser} if a header line exists which differs * from {@link WriteLineDocTask#DEFAULT_FIELDS} and to {@link SimpleLineParser} otherwise. *
*/ public class LineDocSource extends ContentSource { /** Reader of a single input line into {@link DocData}. */ public abstract static class LineParser { protected final String[] header; /** * Construct with the header * * @param header header line found in the input file, or null if none */ public LineParser(String[] header) { this.header = header; } /** parse an input line and fill doc data appropriately */ public abstract void parseLine(DocData docData, String line); } /** * {@link LineParser} which ignores the header passed to its constructor and assumes simply that * field names and their order are the same as in {@link WriteLineDocTask#DEFAULT_FIELDS} */ public static class SimpleLineParser extends LineParser { public SimpleLineParser(String[] header) { super(header); } @Override public void parseLine(DocData docData, String line) { int k1 = 0; int k2 = line.indexOf(WriteLineDocTask.SEP, k1); if (k2 < 0) { throw new RuntimeException( "line: [" + line + "] is in an invalid format (missing: separator title::date)!"); } docData.setTitle(line.substring(k1, k2)); k1 = k2 + 1; k2 = line.indexOf(WriteLineDocTask.SEP, k1); if (k2 < 0) { throw new RuntimeException( "line: [" + line + "] is in an invalid format (missing: separator date::body)!"); } docData.setDate(line.substring(k1, k2)); k1 = k2 + 1; k2 = line.indexOf(WriteLineDocTask.SEP, k1); if (k2 >= 0) { throw new RuntimeException( "line: [" + line + "] is in an invalid format (too many separators)!"); } // last one docData.setBody(line.substring(k1)); } } /** * {@link LineParser} which sets field names and order by the header - any header - of the lines * file. It is less efficient than {@link SimpleLineParser} but more powerful. */ public static class HeaderLineParser extends LineParser { private enum FieldName { NAME, TITLE, DATE, BODY, PROP } private final FieldName[] posToF; public HeaderLineParser(String[] header) { super(header); posToF = new FieldName[header.length]; for (int i = 0; i < header.length; i++) { String f = header[i]; if (DocMaker.NAME_FIELD.equals(f)) { posToF[i] = FieldName.NAME; } else if (DocMaker.TITLE_FIELD.equals(f)) { posToF[i] = FieldName.TITLE; } else if (DocMaker.DATE_FIELD.equals(f)) { posToF[i] = FieldName.DATE; } else if (DocMaker.BODY_FIELD.equals(f)) { posToF[i] = FieldName.BODY; } else { posToF[i] = FieldName.PROP; } } } @Override public void parseLine(DocData docData, String line) { int n = 0; int k1 = 0; int k2; while ((k2 = line.indexOf(WriteLineDocTask.SEP, k1)) >= 0) { if (n >= header.length) { throw new RuntimeException( "input line has invalid format: " + (n + 1) + " fields instead of " + header.length + " :: [" + line + "]"); } setDocDataField(docData, n, line.substring(k1, k2)); ++n; k1 = k2 + 1; } if (n != header.length - 1) { throw new RuntimeException( "input line has invalid format: " + (n + 1) + " fields instead of " + header.length + " :: [" + line + "]"); } // last one setDocDataField(docData, n, line.substring(k1)); } private void setDocDataField(DocData docData, int position, String text) { switch (posToF[position]) { case NAME: docData.setName(text); break; case TITLE: docData.setTitle(text); break; case DATE: docData.setDate(text); break; case BODY: docData.setBody(text); break; case PROP: Properties p = docData.getProps(); if (p == null) { p = new Properties(); docData.setProps(p); } p.setProperty(header[position], text); break; } } } private Path file; private BufferedReader reader; private int readCount; private LineParser docDataLineReader = null; private boolean skipHeaderLine = false; private synchronized void openFile() { try { IOUtils.close(reader); InputStream is = StreamUtils.inputStream(file); reader = new BufferedReader(new InputStreamReader(is, encoding), StreamUtils.BUFFER_SIZE); if (skipHeaderLine) { reader.readLine(); // skip one line - the header line - already handled that info } } catch (IOException e) { throw new RuntimeException(e); } } @Override public void close() throws IOException { IOUtils.close(reader); reader = null; } @Override public DocData getNextDocData(DocData docData) throws NoMoreDataException, IOException { final String line; final int myID; synchronized (this) { line = reader.readLine(); if (line == null) { if (!forever) { throw new NoMoreDataException(); } // Reset the file openFile(); return getNextDocData(docData); } if (docDataLineReader == null) { // first line ever, one time initialization, docDataLineReader = createDocDataLineReader(line); if (skipHeaderLine) { return getNextDocData(docData); } } // increment IDS only once... myID = readCount++; } // The date String was written in the format of DateTools.dateToString. docData.clear(); docData.setID(myID); docDataLineReader.parseLine(docData, line); return docData; } private LineParser createDocDataLineReader(String line) { String[] header; String headIndicator = WriteLineDocTask.FIELDS_HEADER_INDICATOR + WriteLineDocTask.SEP; if (line.startsWith(headIndicator)) { header = line.substring(headIndicator.length()).split(Character.toString(WriteLineDocTask.SEP)); skipHeaderLine = true; // mark to skip the header line when input file is reopened } else { header = WriteLineDocTask.DEFAULT_FIELDS; } // if a specific DocDataLineReader was configured, must respect it String docDataLineReaderClassName = getConfig().get("line.parser", null); if (docDataLineReaderClassName != null) { try { final Class clazz = Class.forName(docDataLineReaderClassName).asSubclass(LineParser.class); Constructor cnstr = clazz.getConstructor(String[].class); return cnstr.newInstance((Object) header); } catch (Exception e) { throw new RuntimeException("Failed to instantiate " + docDataLineReaderClassName, e); } } // if this the simple case, if (Arrays.deepEquals(header, WriteLineDocTask.DEFAULT_FIELDS)) { return new SimpleLineParser(header); } return new HeaderLineParser(header); } @Override public void resetInputs() throws IOException { super.resetInputs(); openFile(); } @Override public void setConfig(Config config) { super.setConfig(config); String fileName = config.get("docs.file", null); if (fileName == null) { throw new IllegalArgumentException("docs.file must be set"); } file = Paths.get(fileName).toAbsolutePath(); if (encoding == null) { encoding = IOUtils.UTF_8; } } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy