/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.pinot.perf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Random;
import org.apache.commons.io.FileUtils;
import org.apache.pinot.core.operator.DocIdSetOperator;
import org.apache.pinot.core.operator.ProjectionOperator;
import org.apache.pinot.core.operator.blocks.ProjectionBlock;
import org.apache.pinot.core.operator.docvalsets.ProjectionBlockValSet;
import org.apache.pinot.core.operator.filter.BaseFilterOperator;
import org.apache.pinot.core.operator.filter.TestFilterOperator;
import org.apache.pinot.core.plan.DocIdSetPlanNode;
import org.apache.pinot.segment.local.indexsegment.immutable.ImmutableSegmentLoader;
import org.apache.pinot.segment.local.segment.creator.impl.SegmentIndexCreationDriverImpl;
import org.apache.pinot.segment.local.segment.readers.GenericRowRecordReader;
import org.apache.pinot.segment.spi.IndexSegment;
import org.apache.pinot.segment.spi.V1Constants;
import org.apache.pinot.segment.spi.creator.SegmentGeneratorConfig;
import org.apache.pinot.segment.spi.datasource.DataSource;
import org.apache.pinot.spi.config.table.TableConfig;
import org.apache.pinot.spi.config.table.TableType;
import org.apache.pinot.spi.data.DimensionFieldSpec;
import org.apache.pinot.spi.data.FieldSpec;
import org.apache.pinot.spi.data.Schema;
import org.apache.pinot.spi.data.readers.GenericRow;
import org.apache.pinot.spi.utils.ReadMode;
import org.apache.pinot.spi.utils.builder.TableConfigBuilder;
import picocli.CommandLine;

/**
 * Class to benchmark lookups on a dictionary-encoded forward index vs. a raw (no-dictionary) forward index.
 * It can take an existing segment with two columns to compare, or it can create a segment on the fly from a
 * given input file containing strings (one string per line).
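 *
 * Example invocations (paths and values below are illustrative only):
 *   RawIndexBenchmark -dataFile /tmp/values.txt -numLookups 100000 -numConsecutiveLookups 50
 *   RawIndexBenchmark -segmentDir /path/to/untarredSegment -rawIndexColumn column_0 -fwdIndexColumn column_1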
*/
@SuppressWarnings({"FieldCanBeLocal", "unused"})
@CommandLine.Command
public class RawIndexBenchmark {
private static final String SEGMENT_DIR_NAME = System.getProperty("java.io.tmpdir") + File.separator + "rawIndexPerf";
private static final String SEGMENT_NAME = "perfTestSegment";
private static final int NUM_COLUMNS = 2;
private static final String DEFAULT_RAW_INDEX_COLUMN = "column_0";
private static final String DEFAULT_FWD_INDEX_COLUMN = "column_1";
private static final int DEFAULT_NUM_LOOKUP = 100_000;
private static final int DEFAULT_NUM_CONSECUTIVE_LOOKUP = 50;
@CommandLine.Option(names = {"-segmentDir"}, required = false, description = "Untarred segment")
private String _segmentDir = null;
@CommandLine.Option(names = {"-fwdIndexColumn"}, required = false,
description = "Name of column with dictionary encoded index")
private String _fwdIndexColumn = DEFAULT_FWD_INDEX_COLUMN;
@CommandLine.Option(names = {"-rawIndexColumn"}, required = false,
description = "Name of column with raw index (no-dictionary")
private String _rawIndexColumn = DEFAULT_RAW_INDEX_COLUMN;
@CommandLine.Option(names = {"-dataFile"}, required = false,
description = "File containing input data (one string per line)")
private String _dataFile = null;
@CommandLine.Option(names = {"-loadMode"}, required = false, description = "Load mode for data (mmap|heap")
private String _loadMode = "heap";
@CommandLine.Option(names = {"-numLookups"}, required = false,
description = "Number of lookups to be performed for benchmark")
private int _numLookups = DEFAULT_NUM_LOOKUP;
@CommandLine.Option(names = {"-numConsecutiveLookups"}, required = false,
description = "Number of consecutive docIds to lookup")
private int _numConsecutiveLookups = DEFAULT_NUM_CONSECUTIVE_LOOKUP;
@CommandLine.Option(names = {"-help", "-h", "--h", "--help"}, required = false, help = true,
description = "print this message")
private boolean _help = false;
private int _numRows = 0;
public void run()
throws Exception {
if (_segmentDir == null && _dataFile == null) {
System.out.println("Error: One of 'segmentDir' or 'dataFile' must be specified");
return;
}
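    // Either load the user-provided segment or build one on the fly from the data file.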
File segmentFile = (_segmentDir == null) ? buildSegment() : new File(_segmentDir);
IndexSegment segment = ImmutableSegmentLoader.load(segmentFile, ReadMode.valueOf(_loadMode));
compareIndexSizes(segment, segmentFile, _fwdIndexColumn, _rawIndexColumn);
compareLookups(segment);
    // Cleanup the temporary segment directory (only created when the segment was built from a data file)
    if (_segmentDir == null) {
      FileUtils.deleteQuietly(new File(SEGMENT_DIR_NAME));
    }
segment.destroy();
}
/**
 * Helper method that builds a segment containing two columns, both populated with data from the input file.
 * The raw index column is created without a dictionary, whereas the other column is dictionary encoded.
 *
 * @return Directory containing the generated segment
 * @throws Exception
*/
private File buildSegment()
throws Exception {
Schema schema = new Schema();
for (int i = 0; i < NUM_COLUMNS; i++) {
String column = "column_" + i;
DimensionFieldSpec dimensionFieldSpec = new DimensionFieldSpec(column, FieldSpec.DataType.STRING, true);
schema.addField(dimensionFieldSpec);
}
TableConfig tableConfig = new TableConfigBuilder(TableType.OFFLINE).setTableName("test").build();
SegmentGeneratorConfig config = new SegmentGeneratorConfig(tableConfig, schema);
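    // Only _rawIndexColumn is created without a dictionary; the other column keeps the default
    // dictionary-encoded forward index.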
config.setRawIndexCreationColumns(Collections.singletonList(_rawIndexColumn));
config.setOutDir(SEGMENT_DIR_NAME);
config.setSegmentName(SEGMENT_NAME);
    List<GenericRow> rows = new ArrayList<>();
    System.out.println("Reading data...");
    try (BufferedReader reader = new BufferedReader(new FileReader(_dataFile))) {
      String value;
      while ((value = reader.readLine()) != null) {
        Map<String, Object> map = new HashMap<>();
        for (FieldSpec fieldSpec : schema.getAllFieldSpecs()) {
          map.put(fieldSpec.getName(), value);
        }
        GenericRow genericRow = new GenericRow();
        genericRow.init(map);
        rows.add(genericRow);
        _numRows++;
        if (_numRows % 1_000_000 == 0) {
          System.out.println("Read rows: " + _numRows);
        }
      }
    }
System.out.println("Generating segment...");
SegmentIndexCreationDriverImpl driver = new SegmentIndexCreationDriverImpl();
driver.init(config, new GenericRowRecordReader(rows));
driver.build();
return new File(SEGMENT_DIR_NAME, SEGMENT_NAME);
}
/**
 * Compares and prints the index sizes for the raw and dictionary-encoded columns.
 *
 * @param segment Segment to compare
 * @param segmentDir Directory containing the segment files
 * @param fwdIndexColumn Name of the dictionary-encoded column
 * @param rawIndexColumn Name of the raw index (no-dictionary) column
*/
private void compareIndexSizes(IndexSegment segment, File segmentDir, String fwdIndexColumn, String rawIndexColumn) {
String filePrefix = segmentDir.getAbsolutePath() + File.separator;
File rawIndexFile = new File(filePrefix + rawIndexColumn + V1Constants.Indexes.RAW_SV_FORWARD_INDEX_FILE_EXTENSION);
    String extension = (segment.getDataSource(fwdIndexColumn).getDataSourceMetadata().isSorted())
        ? V1Constants.Indexes.SORTED_SV_FORWARD_INDEX_FILE_EXTENSION
        : V1Constants.Indexes.UNSORTED_SV_FORWARD_INDEX_FILE_EXTENSION;
    File fwdIndexFile = new File(filePrefix + fwdIndexColumn + extension);
    File fwdIndexDictFile = new File(filePrefix + fwdIndexColumn + V1Constants.Dict.FILE_EXTENSION);
long rawIndexSize = rawIndexFile.length();
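    // Size of the dictionary-encoded column includes both its forward index and its dictionary file.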
long fwdIndexSize = fwdIndexFile.length() + fwdIndexDictFile.length();
System.out.println("Raw index size: " + toMegaBytes(rawIndexSize) + " MB.");
System.out.println("Fwd index size: " + toMegaBytes(fwdIndexSize) + " MB.");
System.out.println("Storage space saving: " + ((fwdIndexSize - rawIndexSize) * 100.0 / fwdIndexSize) + " %");
}
/**
* Compares lookup times for the two columns.
 * Performs {@link #_numLookups} lookups on each column using the same randomly generated docIds.
*
* @param segment Segment to compare the columns for
*/
private void compareLookups(IndexSegment segment) {
int[] filteredDocIds = generateDocIds(segment);
long rawIndexTime = profileLookups(segment, _rawIndexColumn, filteredDocIds);
long fwdIndexTime = profileLookups(segment, _fwdIndexColumn, filteredDocIds);
System.out.println("Raw index lookup time: " + rawIndexTime);
System.out.println("Fwd index lookup time: " + fwdIndexTime);
System.out.println("Percentage change: " + ((fwdIndexTime - rawIndexTime) * 100.0 / rawIndexTime) + " %");
}
/**
* Profiles the lookup time for a given column, for the given docIds.
*
* @param segment Segment to profile
* @param column Column to profile
* @param docIds DocIds to lookup on the column
 * @return Time taken in millis for the lookups
*/
private long profileLookups(IndexSegment segment, String column, int[] docIds) {
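    // Mimic the scan path: the filter emits the pre-generated docIds, DocIdSetOperator batches them into
    // blocks of at most DocIdSetPlanNode.MAX_DOC_PER_CALL, and ProjectionOperator projects the column values.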
BaseFilterOperator filterOperator = new TestFilterOperator(docIds);
DocIdSetOperator docIdSetOperator = new DocIdSetOperator(filterOperator, DocIdSetPlanNode.MAX_DOC_PER_CALL);
ProjectionOperator projectionOperator = new ProjectionOperator(buildDataSourceMap(segment), docIdSetOperator);
long start = System.currentTimeMillis();
ProjectionBlock projectionBlock;
while ((projectionBlock = projectionOperator.nextBlock()) != null) {
ProjectionBlockValSet blockValueSet = (ProjectionBlockValSet) projectionBlock.getBlockValueSet(column);
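      // Read the values to exercise the lookup path; the result is intentionally discarded.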
blockValueSet.getDoubleValuesSV();
}
return (System.currentTimeMillis() - start);
}
/**
 * Converts bytes to megabytes.
*
* @param sizeInBytes Size to convert
 * @return Size in MB
*/
private double toMegaBytes(long sizeInBytes) {
    return sizeInBytes / (1024.0 * 1024);
}
/**
* Helper method to build map from column to data source
*
* @param segment Segment for which to build the map
* @return Column to data source map
*/
  private Map<String, DataSource> buildDataSourceMap(IndexSegment segment) {
    Map<String, DataSource> dataSourceMap = new HashMap<>();
for (String column : segment.getPhysicalColumnNames()) {
dataSourceMap.put(column, segment.getDataSource(column));
}
return dataSourceMap;
}
/**
 * Generates random docIds.
 *
 * - A total of {@link #_numLookups} docIds are generated.
 * - DocIds are generated in clusters of {@link #_numConsecutiveLookups} consecutive ids.
 *
 * @param segment Segment for which to generate the docIds
 * @return Array of generated docIds
*/
private int[] generateDocIds(IndexSegment segment) {
Random random = new Random();
int numDocs = segment.getSegmentMetadata().getTotalDocs();
int maxDocId = numDocs - _numConsecutiveLookups - 1;
int[] docIdSet = new int[_numLookups];
int j = 0;
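    // Generate docIds in clusters: each cluster starts at a random docId and covers _numConsecutiveLookups
    // consecutive ids, so the lookups mix random seeks with sequential reads.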
for (int i = 0; i < (_numLookups / _numConsecutiveLookups); i++) {
int startDocId = random.nextInt(maxDocId);
int endDocId = startDocId + _numConsecutiveLookups;
for (int docId = startDocId; docId < endDocId; docId++) {
docIdSet[j++] = docId;
}
}
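    // Fill any remainder (when _numLookups is not a multiple of _numConsecutiveLookups) with one more
    // consecutive run starting at a random docId.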
int docId = random.nextInt(maxDocId);
for (; j < _numLookups; j++) {
docIdSet[j] = docId++;
}
return docIdSet;
}
/**
* Main method for the class. Parses the command line arguments, and invokes the benchmark.
*
* @param args Command line arguments.
* @throws Exception
*/
public static void main(String[] args)
throws Exception {
RawIndexBenchmark benchmark = new RawIndexBenchmark();
CommandLine commandLine = new CommandLine(benchmark);
    commandLine.parseArgs(args);
    if (benchmark._help) {
      commandLine.usage(System.out);
      return;
    }
    benchmark.run();
}
}