All downloads are free. Search and download functionality uses the official Maven repository.

org.apache.lucene.benchmark.byTask.feeds.DocMaker Maven / Gradle / Ivy

There is a newer version: 9.11.1
Show newest version
package org.apache.lucene.benchmark.byTask.feeds;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.Random;
import java.util.TimeZone;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.lucene.benchmark.byTask.utils.Config;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType.NumericType;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.DoubleField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;

/**
 * Creates {@link Document} objects. Uses a {@link ContentSource} to generate
 * {@link DocData} objects. Supports the following parameters:
 * 
    *
  • content.source - specifies the {@link ContentSource} class to use * (default SingleDocSource). *
  • doc.stored - specifies whether fields should be stored (default * false). *
  • doc.body.stored - specifies whether the body field should be stored (default * = doc.stored). *
  • doc.tokenized - specifies whether fields should be tokenized * (default true). *
  • doc.body.tokenized - specifies whether the * body field should be tokenized (default = doc.tokenized). *
  • doc.tokenized.norms - specifies whether norms should be stored in * the index or not. (default false). *
  • doc.body.tokenized.norms - specifies whether norms should be * stored in the index for the body field. This can be set to true, while * doc.tokenized.norms is set to false, to allow norms storing just * for the body field. (default true). *
  • doc.term.vector - specifies whether term vectors should be stored * for fields (default false). *
  • doc.term.vector.positions - specifies whether term vectors should * be stored with positions (default false). *
  • doc.term.vector.offsets - specifies whether term vectors should be * stored with offsets (default false). *
  • doc.store.body.bytes - specifies whether to store the raw bytes of * the document's content in the document (default false). *
  • doc.reuse.fields - specifies whether Field and Document objects * should be reused (default true). *
  • doc.index.props - specifies whether the properties returned by *
  • doc.random.id.limit - if specified, docs will be assigned random * IDs from 0 to this limit. This is useful with UpdateDoc * for testing performance of IndexWriter.updateDocument. * {@link DocData#getProps()} will be indexed. (default false). *
*/ public class DocMaker implements Closeable { private static class LeftOver { private DocData docdata; private int cnt; } private Random r; private int updateDocIDLimit; /** * Document state, supports reuse of field instances * across documents (see reuseFields parameter). */ protected static class DocState { private final Map fields; private final Map numericFields; private final boolean reuseFields; final Document doc; DocData docData = new DocData(); public DocState(boolean reuseFields, FieldType ft, FieldType bodyFt) { this.reuseFields = reuseFields; if (reuseFields) { fields = new HashMap(); numericFields = new HashMap(); // Initialize the map with the default fields. fields.put(BODY_FIELD, new Field(BODY_FIELD, "", bodyFt)); fields.put(TITLE_FIELD, new Field(TITLE_FIELD, "", ft)); fields.put(DATE_FIELD, new Field(DATE_FIELD, "", ft)); fields.put(ID_FIELD, new StringField(ID_FIELD, "", Field.Store.YES)); fields.put(NAME_FIELD, new Field(NAME_FIELD, "", ft)); numericFields.put(DATE_MSEC_FIELD, new LongField(DATE_MSEC_FIELD, 0L, Field.Store.NO)); numericFields.put(TIME_SEC_FIELD, new IntField(TIME_SEC_FIELD, 0, Field.Store.NO)); doc = new Document(); } else { numericFields = null; fields = null; doc = null; } } /** * Returns a field corresponding to the field name. If * reuseFields was set to true, then it attempts to reuse a * Field instance. If such a field does not exist, it creates a new one. 
*/ Field getField(String name, FieldType ft) { if (!reuseFields) { return new Field(name, "", ft); } Field f = fields.get(name); if (f == null) { f = new Field(name, "", ft); fields.put(name, f); } return f; } Field getNumericField(String name, NumericType type) { Field f; if (reuseFields) { f = numericFields.get(name); } else { f = null; } if (f == null) { switch(type) { case INT: f = new IntField(name, 0, Field.Store.NO); break; case LONG: f = new LongField(name, 0L, Field.Store.NO); break; case FLOAT: f = new FloatField(name, 0.0F, Field.Store.NO); break; case DOUBLE: f = new DoubleField(name, 0.0, Field.Store.NO); break; default: throw new AssertionError("Cannot get here"); } if (reuseFields) { numericFields.put(name, f); } } return f; } } private boolean storeBytes = false; private static class DateUtil { public SimpleDateFormat parser = new SimpleDateFormat("dd-MMM-yyyy HH:mm:ss", Locale.ROOT); public Calendar cal = Calendar.getInstance(TimeZone.getTimeZone("GMT"), Locale.ROOT); public ParsePosition pos = new ParsePosition(0); public DateUtil() { parser.setLenient(true); } } // leftovers are thread local, because it is unsafe to share residues between threads private ThreadLocal leftovr = new ThreadLocal(); private ThreadLocal docState = new ThreadLocal(); private ThreadLocal dateParsers = new ThreadLocal(); public static final String BODY_FIELD = "body"; public static final String TITLE_FIELD = "doctitle"; public static final String DATE_FIELD = "docdate"; public static final String DATE_MSEC_FIELD = "docdatenum"; public static final String TIME_SEC_FIELD = "doctimesecnum"; public static final String ID_FIELD = "docid"; public static final String BYTES_FIELD = "bytes"; public static final String NAME_FIELD = "docname"; protected Config config; protected FieldType valType; protected FieldType bodyValType; protected ContentSource source; protected boolean reuseFields; protected boolean indexProperties; private final AtomicInteger numDocsCreated = new 
AtomicInteger(); public DocMaker() { } // create a doc // use only part of the body, modify it to keep the rest (or use all if size==0). // reset the docdata properties so they are not added more than once. private Document createDocument(DocData docData, int size, int cnt) throws UnsupportedEncodingException { final DocState ds = getDocState(); final Document doc = reuseFields ? ds.doc : new Document(); doc.getFields().clear(); // Set ID_FIELD FieldType ft = new FieldType(valType); ft.setIndexed(true); Field idField = ds.getField(ID_FIELD, ft); int id; if (r != null) { id = r.nextInt(updateDocIDLimit); } else { id = docData.getID(); if (id == -1) { id = numDocsCreated.getAndIncrement(); } } idField.setStringValue(Integer.toString(id)); doc.add(idField); // Set NAME_FIELD String name = docData.getName(); if (name == null) name = ""; name = cnt < 0 ? name : name + "_" + cnt; Field nameField = ds.getField(NAME_FIELD, valType); nameField.setStringValue(name); doc.add(nameField); // Set DATE_FIELD DateUtil util = dateParsers.get(); if (util == null) { util = new DateUtil(); dateParsers.set(util); } Date date = null; String dateString = docData.getDate(); if (dateString != null) { util.pos.setIndex(0); date = util.parser.parse(dateString, util.pos); //System.out.println(dateString + " parsed to " + date); } else { dateString = ""; } Field dateStringField = ds.getField(DATE_FIELD, valType); dateStringField.setStringValue(dateString); doc.add(dateStringField); if (date == null) { // just set to right now date = new Date(); } Field dateField = ds.getNumericField(DATE_MSEC_FIELD, NumericType.LONG); dateField.setLongValue(date.getTime()); doc.add(dateField); util.cal.setTime(date); final int sec = util.cal.get(Calendar.HOUR_OF_DAY)*3600 + util.cal.get(Calendar.MINUTE)*60 + util.cal.get(Calendar.SECOND); Field timeSecField = ds.getNumericField(TIME_SEC_FIELD, NumericType.INT); timeSecField.setIntValue(sec); doc.add(timeSecField); // Set TITLE_FIELD String title = 
docData.getTitle(); Field titleField = ds.getField(TITLE_FIELD, valType); titleField.setStringValue(title == null ? "" : title); doc.add(titleField); String body = docData.getBody(); if (body != null && body.length() > 0) { String bdy; if (size <= 0 || size >= body.length()) { bdy = body; // use all docData.setBody(""); // nothing left } else { // attempt not to break words - if whitespace found within next 20 chars... for (int n = size - 1; n < size + 20 && n < body.length(); n++) { if (Character.isWhitespace(body.charAt(n))) { size = n; break; } } bdy = body.substring(0, size); // use part docData.setBody(body.substring(size)); // some left } Field bodyField = ds.getField(BODY_FIELD, bodyValType); bodyField.setStringValue(bdy); doc.add(bodyField); if (storeBytes) { Field bytesField = ds.getField(BYTES_FIELD, StringField.TYPE_STORED); bytesField.setBytesValue(bdy.getBytes("UTF-8")); doc.add(bytesField); } } if (indexProperties) { Properties props = docData.getProps(); if (props != null) { for (final Map.Entry entry : props.entrySet()) { Field f = ds.getField((String) entry.getKey(), valType); f.setStringValue((String) entry.getValue()); doc.add(f); } docData.setProps(null); } } //System.out.println("============== Created doc "+numDocsCreated+" :\n"+doc+"\n=========="); return doc; } private void resetLeftovers() { leftovr.set(null); } protected DocState getDocState() { DocState ds = docState.get(); if (ds == null) { ds = new DocState(reuseFields, valType, bodyValType); docState.set(ds); } return ds; } /** * Closes the {@link DocMaker}. The base implementation closes the * {@link ContentSource}, and it can be overridden to do more work (but make * sure to call super.close()). */ @Override public void close() throws IOException { source.close(); } /** * Creates a {@link Document} object ready for indexing. This method uses the * {@link ContentSource} to get the next document from the source, and creates * a {@link Document} object from the returned fields. 
If * reuseFields was set to true, it will reuse {@link Document} * and {@link Field} instances. */ public Document makeDocument() throws Exception { resetLeftovers(); DocData docData = source.getNextDocData(getDocState().docData); Document doc = createDocument(docData, 0, -1); return doc; } /** * Same as {@link #makeDocument()}, only this method creates a document of the * given size input by size. */ public Document makeDocument(int size) throws Exception { LeftOver lvr = leftovr.get(); if (lvr == null || lvr.docdata == null || lvr.docdata.getBody() == null || lvr.docdata.getBody().length() == 0) { resetLeftovers(); } DocData docData = getDocState().docData; DocData dd = (lvr == null ? source.getNextDocData(docData) : lvr.docdata); int cnt = (lvr == null ? 0 : lvr.cnt); while (dd.getBody() == null || dd.getBody().length() < size) { DocData dd2 = dd; dd = source.getNextDocData(new DocData()); cnt = 0; dd.setBody(dd2.getBody() + dd.getBody()); } Document doc = createDocument(dd, size, cnt); if (dd.getBody() == null || dd.getBody().length() == 0) { resetLeftovers(); } else { if (lvr == null) { lvr = new LeftOver(); leftovr.set(lvr); } lvr.docdata = dd; lvr.cnt = ++cnt; } return doc; } /** Reset inputs so that the test run would behave, input wise, as if it just started. */ public synchronized void resetInputs() throws IOException { source.printStatistics("docs"); // re-initiate since properties by round may have changed. setConfig(config, source); source.resetInputs(); numDocsCreated.set(0); resetLeftovers(); } /** Set the configuration parameters of this doc maker. 
*/ public void setConfig(Config config, ContentSource source) { this.config = config; this.source = source; boolean stored = config.get("doc.stored", false); boolean bodyStored = config.get("doc.body.stored", stored); boolean tokenized = config.get("doc.tokenized", true); boolean bodyTokenized = config.get("doc.body.tokenized", tokenized); boolean norms = config.get("doc.tokenized.norms", false); boolean bodyNorms = config.get("doc.body.tokenized.norms", true); boolean termVec = config.get("doc.term.vector", false); boolean termVecPositions = config.get("doc.term.vector.positions", false); boolean termVecOffsets = config.get("doc.term.vector.offsets", false); valType = new FieldType(TextField.TYPE_NOT_STORED); valType.setStored(stored); valType.setTokenized(tokenized); valType.setOmitNorms(!norms); valType.setStoreTermVectors(termVec); valType.setStoreTermVectorPositions(termVecPositions); valType.setStoreTermVectorOffsets(termVecOffsets); valType.freeze(); bodyValType = new FieldType(TextField.TYPE_NOT_STORED); bodyValType.setStored(bodyStored); bodyValType.setTokenized(bodyTokenized); bodyValType.setOmitNorms(!bodyNorms); bodyValType.setStoreTermVectors(termVec); bodyValType.setStoreTermVectorPositions(termVecPositions); bodyValType.setStoreTermVectorOffsets(termVecOffsets); bodyValType.freeze(); storeBytes = config.get("doc.store.body.bytes", false); reuseFields = config.get("doc.reuse.fields", true); // In a multi-rounds run, it is important to reset DocState since settings // of fields may change between rounds, and this is the only way to reset // the cache of all threads. docState = new ThreadLocal(); indexProperties = config.get("doc.index.props", false); updateDocIDLimit = config.get("doc.random.id.limit", -1); if (updateDocIDLimit != -1) { r = new Random(179); } } }




© 2015 - 2024 Weber Informatics LLC | Privacy Policy