org.apache.jackrabbit.oak.benchmark.FullTextSearchTest Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of oak-run Show documentation
There is a newer version: 1.72.0
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.jackrabbit.oak.benchmark;

import static com.google.common.collect.Lists.newArrayList;
import static com.google.common.collect.Sets.newHashSet;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.jcr.Node;
import javax.jcr.Repository;
import javax.jcr.Session;
import javax.jcr.query.Query;
import javax.jcr.query.QueryManager;
import javax.jcr.query.QueryResult;
import javax.jcr.query.RowIterator;

import org.apache.commons.io.FileUtils;
import org.apache.jackrabbit.oak.Oak;
import org.apache.jackrabbit.oak.benchmark.wikipedia.WikipediaImport;
import org.apache.jackrabbit.oak.fixture.JcrCreator;
import org.apache.jackrabbit.oak.fixture.OakRepositoryFixture;
import org.apache.jackrabbit.oak.fixture.RepositoryFixture;
import org.apache.jackrabbit.oak.jcr.Jcr;
import org.apache.jackrabbit.oak.plugins.index.lucene.IndexCopier;
import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexEditorProvider;
import org.apache.jackrabbit.oak.plugins.index.lucene.LuceneIndexProvider;
import org.apache.jackrabbit.oak.plugins.index.lucene.util.LuceneInitializerHelper;
import org.apache.jackrabbit.oak.spi.commit.Observer;
import org.apache.jackrabbit.oak.spi.query.QueryIndexProvider;

public class FullTextSearchTest extends AbstractTest {

    /**
     * Pattern used to find words and other searchable tokens within the
     * imported Wikipedia pages.
     */
    private static final Pattern WORD_PATTERN =
            Pattern.compile("\\p{LD}{3,}");

    private int maxSampleSize = 100;

    private final boolean disableCopyOnRead = Boolean.getBoolean("disableCopyOnRead");

    private final WikipediaImport importer;

    private final Set sampleSet = newHashSet();

    private final Random random = new Random(42); //fixed seed

    private int count = 0;

    private int maxRowsToFetch = Integer.getInteger("maxRowsToFetch",100);

    private TestContext defaultContext;

    /**
     * null means true; true means true
     */
    protected Boolean storageEnabled;

    private ExecutorService executorService = Executors.newFixedThreadPool(2);

    private File indexCopierDir;

    public FullTextSearchTest(File dump, boolean flat, boolean doReport, Boolean storageEnabled) {
        this.importer = new WikipediaImport(dump, flat, doReport) {
            @Override
            protected void pageAdded(String title, String text) {
                count++;
                if (count % 100 == 0
                        && sampleSet.size() < maxSampleSize
                        && text != null) {
                    List words = newArrayList();

                    Matcher matcher = WORD_PATTERN.matcher(text);
                    while (matcher.find()) {
                        words.add(matcher.group());
                    }

                    if (!words.isEmpty()) {
                        sampleSet.add(words.get(words.size() / 2));
                    }
                }
            }
        };
        this.storageEnabled = storageEnabled;
        this.indexCopierDir = createTemporaryFolder(null);
    }

    @Override
    public void beforeSuite() throws Exception {
        random.setSeed(42);
        sampleSet.clear();
        count = 0;

        importer.importWikipedia(loginWriter());
        Thread.sleep(10); // allow some time for the indexer to catch up

        defaultContext = new TestContext();
    }

    @Override
    protected void afterSuite() throws Exception {
        executorService.shutdown();
        executorService.awaitTermination(1, TimeUnit.MINUTES);
        FileUtils.deleteDirectory(indexCopierDir);
    }

    @Override
    protected TestContext prepareThreadExecutionContext() {
        return new TestContext();
    }

    @Override
    protected void runTest() throws Exception {
        runTest(defaultContext);
    }

    @SuppressWarnings("deprecation")
    @Override
    protected void runTest(TestContext ec)  throws Exception {
        QueryManager qm = ec.session.getWorkspace().getQueryManager();
        // TODO verify why "order by jcr:score()" accounts for what looks
        // like > 20% of the perf lost in Collections.sort
        for (String word : ec.words) {
            Query q = qm.createQuery("//*[jcr:contains(@text, '" + word + "')] ", Query.XPATH);
            QueryResult r = q.execute();
            RowIterator it = r.getRows();
            for (int rows = 0; it.hasNext() && rows < maxRowsToFetch; rows++) {
                Node n = it.nextRow().getNode();
                ec.hash += n.getProperty("text").getString().hashCode();
                ec.hash += n.getProperty("title").getString().hashCode();
            }
        }
    }

    class TestContext {
        final Session session = loginWriter();
        final String[] words = getRandomWords();
        int hash = 0; // summary variable to prevent JIT compiler tricks
    }

    private String[] getRandomWords() {
        List samples = newArrayList(sampleSet);
        String[] words = new String[100];
        for (int i = 0; i < words.length; i++) {
            words[i] = samples.get(random.nextInt(samples.size()));
        }
        return words;
    }

    @Override
    protected Repository[] createRepository(RepositoryFixture fixture) throws Exception {
        if (fixture instanceof OakRepositoryFixture) {
            return ((OakRepositoryFixture) fixture).setUpCluster(1, new JcrCreator() {
                @Override
                public Jcr customize(Oak oak) {
                    LuceneIndexProvider provider = createLuceneIndexProvider();
                    oak.with((QueryIndexProvider) provider)
                            .with((Observer) provider)
                            .with(new LuceneIndexEditorProvider())
                            .with(new LuceneInitializerHelper("luceneGlobal", storageEnabled));
                    return new Jcr(oak);
                }
            });
        }
        return super.createRepository(fixture);
    }

    private LuceneIndexProvider createLuceneIndexProvider() {
        if (!disableCopyOnRead) {
            try {
                IndexCopier copier = new IndexCopier(executorService, indexCopierDir, true);
                return new LuceneIndexProvider(copier);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
        return new LuceneIndexProvider();
    }

    private File createTemporaryFolder(File parentFolder){
        File createdFolder = null;
        try {
            createdFolder = File.createTempFile("oak", "", parentFolder);
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
        createdFolder.delete();
        createdFolder.mkdir();
        return createdFolder;
    }
}