org.riversun.bigdoc.bin.BigFileSearcher Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of bigdoc Show documentation
This library allows you to handle gigabyte order huge files easily with high performance. You can search bytes or words / read data/text from huge files.
The newest version!
/*  bigdoc Java lib for easy to read/search from a big document
 *
 *  Copyright (c) 2006-2016 Tom Misawa, [email protected]
 *  
 *  Permission is hereby granted, free of charge, to any person obtaining a
 *  copy of this software and associated documentation files (the "Software"),
 *  to deal in the Software without restriction, including without limitation
 *  the rights to use, copy, modify, merge, publish, distribute, sublicense,
 *  and/or sell copies of the Software, and to permit persons to whom the
 *  Software is furnished to do so, subject to the following conditions:
 *  
 *  The above copyright notice and this permission notice shall be included in
 *  all copies or substantial portions of the Software.
 *  
 *  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 *  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 *  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 *  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 *  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 *  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 *  DEALINGS IN THE SOFTWARE.
 *  
 */
package org.riversun.bigdoc.bin;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.CancellationException;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicBoolean;

import org.riversun.bigdoc.bin.BinFileSearcher.BinFileProgressListener;

/**
 * 
 * Search bytes from big file

 * Available for giga-bytes order file 

 * 
 * @author Tom Misawa ([email protected])
 */
public class BigFileSearcher {

  public static interface OnProgressListener {
    public void onProgress(float progress);
  }

  public static interface OnRealtimeResultListener {
    public void onRealtimeResultListener(float progress, List pointerList);
  }

  private interface BinFileProgressListenerEx {
    public void onProgress(int workerNumber, int workerSize, List pointerList, float progress);
  }

  private ProgressCache progressCache;

  private boolean useOptimization = true;

  /**
   * Use memory and threading optimization

   * (default is true)
   * 
   * @param enabled
   *                optimization enabled or not
   */
  public void setUseOptimization(boolean enabled) {
    this.useOptimization = enabled;
  }

  private final BinFileSearcher binFileSearcher = new BinFileSearcher();

  // max number of thread
  public static final int DEFAULT_MAX_NUM_OF_THREADS = 24;

  // thread number no limit
  public static final int THREADS_NO_LIMIT = 0;

  // Unit size when split loading
  public static final int DEFAULT_BLOCK_SIZE = 10 * 1024 * 1024;

  private int bufferSizePerWorker = BinFileSearcher.DEFAULT_BUFFER_SIZE;

  /**
   * Number of threads used at the same time in one search
   */
  private int subThreadSize = BinFileSearcher.DEFAULT_SUB_THREAD_SIZE;

  /**
   * The size of the window used to scan memory
   */
  private int subBufferSize = BinFileSearcher.DEFAULT_SUB_BUFFER_SIZE;

  /**
   * Size per unit when divide loading big sized file into multiple pieces

   */
  private long blockSize = DEFAULT_BLOCK_SIZE;

  private int maxNumOfThreads = DEFAULT_MAX_NUM_OF_THREADS;

  private OnProgressListener onProgressListener;
  private OnRealtimeResultListener onRealtimeResultListener;

  private long _profile_lastStartTime;
  private long _profile_lastEndTime;

  /**
   * Set size per unit when divide loading big sized file into multiple pieces

   * 

   * In order to make this method effective,call setUseOptimization(false) to
   * turn off the optimization.

   * 
   * @param blockSize
   *                  size per unit when divide loading big sized file
   */
  public void setBlockSize(long blockSize) {
    this.blockSize = blockSize;
  }

  /**
   * Set size to be read into memory at one search 

   * In order to make this method effective,call setUseOptimization(false) to
   * turn off the optimization.

   * 
   * @param bufferSize
   *                   size(byte) to be read into memory at one search operation
   */
  public void setBufferSizePerWorker(int bufferSize) {
    this.bufferSizePerWorker = bufferSize;
  }

  /**
   * Set max number of thread to concurrent load to file 

   * Increased the number of threads does not means improving a performance

   * 

   * In order to make this method effective,call setUseOptimization(false) to
   * turn off the optimization.

   * 
   * @param maxNumOfThreads
   *                        number of threads(concurrency)
   */
  public void setMaxNumOfThreads(int maxNumOfThreads) {
    this.maxNumOfThreads = maxNumOfThreads;
  }

  /**
   * Set number of threads used in each worker 

   * In order to make this method effective,call setUseOptimization(false) to
   * turn off the optimization.

   * 
   * @param subThreadSize
   *                      number of threads for sub threads(concurrency)
   */
  public void setSubThreadSize(int subThreadSize) {
    this.subThreadSize = subThreadSize;
  }

  /**
   * Set the size of the window used to scan memory used in each worker 

   * In order to make this method effective,call setUseOptimization(false) to
   * turn off the optimization.

   * 
   * @param subBufferSize
   *                      size(bytes) of the window
   */
  public void setSubBufferSize(int subBufferSize) {
    this.subBufferSize = subBufferSize;
  }

  /**
   * Returns the index within this file of the first occurrence of the
   * specified substring.
   * 
   * @param f
   *                    target file
   * @param searchBytes
   *                    sequence of bytes you want to search
   * @return
   */
  public Long indexOf(File f, byte[] searchBytes) {
    return indexOf(f, searchBytes, 0);
  }

  /**
   * Returns the index within this file of the first occurrence of the
   * specified substring, starting at the specified position.
   * 
   * @param f
   *                     target file
   * @param searchBytes
   *                     a sequence of bytes you want to find
   * @param fromPosition
   *                     "0" means the beginning of the file
   * @return position of the first occurence. '-1' means that it was not
   *         found.
   */
  public Long indexOf(File f, byte[] searchBytes, long fromPosition) {
    return binFileSearcher.indexOf(f, searchBytes, fromPosition);
  }

  /**
   * Search bytes from big file faster with realtime result callback

   * 

   * This callbacks the result in real time, but since the concurrency is
   * inferior to #searchBigFile,so the execution speed is slower than
   * #searchBigFile
   * 
   * @param f
   *                    targetFile
   * @param searchBytes
   *                    sequence of bytes you want to search
   * @param listener
   *                    callback for progress and realtime result
   */
  public List searchBigFileRealtime(File f, byte[] searchBytes, OnRealtimeResultListener listener) {
    return searchBigFileRealtime(f, searchBytes, 0, listener);
  }

  /**
   * * Search bytes from big file faster with realtime result callback

   * 

   * This callbacks the result in real time, but since the concurrency is
   * inferior to #searchBigFile,so the execution speed is slower than
   * #searchBigFile
   * 
   * @param f
   *                      targetFile
   * @param searchBytes
   *                      sequence of bytes you want to search
   * @param startPosition
   *                      starting position
   * @param listener
   *                      callback for progress and realtime result
   * @return
   */
  public List searchBigFileRealtime(File f, byte[] searchBytes, long startPosition, OnRealtimeResultListener listener) {

    this.onRealtimeResultListener = listener;
    this.onProgressListener = null;

    int numOfThreadsOptimized = (int) (f.length() / blockSize);

    if (numOfThreadsOptimized == 0) {
      numOfThreadsOptimized = 1;
    }

    final long fileLen = f.length();

    // optimize before calling the method
    optimize(fileLen);

    setMaxNumOfThreads(1);
    setBlockSize(fileLen);

    return searchBigFile(f, searchBytes, numOfThreadsOptimized, false, startPosition);
  }

  /**
   * Search bytes from big file faster in a concurrent processing with
   * progress callback
   * 
   * @param f
   *                    target file
   * @param searchBytes
   *                    sequence of bytes you want to search
   * @return
   */
  public List searchBigFile(File f, byte[] searchBytes) {
    return searchBigFile(f, searchBytes, null);
  }

  /**
   * Search bytes from big file faster in a concurrent processing with
   * progress callback
   * 
   * @param f
   *                    target file
   * @param searchBytes
   *                    sequence of bytes you want to search
   * @param listener
   *                    callback for progress
   * @return
   */
  public List searchBigFile(File f, byte[] searchBytes, OnProgressListener listener) {

    this.onRealtimeResultListener = null;
    this.onProgressListener = listener;

    int numOfThreadsOptimized = (int) (f.length() / (long) blockSize);

    if (numOfThreadsOptimized == 0) {
      numOfThreadsOptimized = 1;
    }

    return searchBigFile(f, searchBytes, numOfThreadsOptimized, this.useOptimization, 0);
  }

  /**
   * Search bytes faster in a concurrent processing with concurrency level.
   * 
   * @param srcFile
   *                        target file
   * @param searchBytes
   *                        sequence of bytes you want to search
   * @param numOfThreads
   *                        number of threads
   * @param useOptimization
   *                        use optimization or not
   * @param startPosition
   *                        starting position
   * @return
   */
  private List searchBigFile(File srcFile, byte[] searchBytes, int numOfThreads, boolean useOptimization, long startPosition) {
    SearchCondition sc = new SearchCondition();
    sc.srcFile = srcFile;
    sc.searchBytes = searchBytes;
    sc.numOfThreads = numOfThreads;
    sc.useOptimization = useOptimization;
    sc.startPosition = startPosition;
    return searchBigFile(sc);
  }

  public List mTaskList;

  /**
   * Cancel searching
   */
  public void cancel() {
    if (mTaskList != null) {
      for (BigFileSearchTask task : mTaskList) {
        task.cancel();
      }
    }
  }

  public List searchBigFile(SearchCondition sc) {

    this.onRealtimeResultListener = sc.onRealtimeResultListener;
    this.onProgressListener = sc.onProgressListener;

    progressCache = null;

    final long endPosition;

    if (sc.endPosition == -1) {
      endPosition = sc.srcFile.length() - 1;
    } else {
      endPosition = sc.endPosition;
    }
    final long sizeOfSrcBytes = sc.srcFile.length();

    if (useOptimization || sc.useOptimization) {
      optimize(sc.srcFile.length());
    }

    _profile_lastStartTime = System.currentTimeMillis();

    if (sc.numOfThreads == 0) {
      sc.numOfThreads = 1;
    }

    final int sizeOfSearchBytes = sc.searchBytes.length;
    final long bytesToReadBlockSize = (sizeOfSrcBytes - (long) sizeOfSearchBytes) / (long) sc.numOfThreads;

    final int threadPoolSize;

    if (maxNumOfThreads == THREADS_NO_LIMIT) {
      threadPoolSize = sc.numOfThreads;
    } else {
      threadPoolSize = maxNumOfThreads;
    }

    final ExecutorService executorService = Executors.newFixedThreadPool(threadPoolSize);

    List>> futureList = new ArrayList>>();

    for (int i = 0; i < sc.numOfThreads; i++) {

      final long offset = bytesToReadBlockSize * (long) i + sc.startPosition;
      final long readLeng;

      if (i == sc.numOfThreads - 1) {
        // if it's the last element.
        readLeng = sizeOfSrcBytes - offset;
      } else {
        // else , add the overlapping part size to blockSize
        readLeng = bytesToReadBlockSize + sizeOfSearchBytes;
      }

      final BinFileProgressListenerEx progressListener;

      if (onProgressListener == null && onRealtimeResultListener == null) {
        progressListener = null;
      } else {

        progressListener = new BinFileProgressListenerEx() {

          @Override
          public void onProgress(int workerNumber, int workerSize, List pointerList, float progress) {
            BigFileSearcher.this.onProgress(workerNumber, workerSize, pointerList, progress);
          }
        };
      }

      final int workerSize = sc.numOfThreads;
      final int workerNumber = i;

      final BigFileSearchTask task = new BigFileSearchTask(sc.srcFile, sc.searchBytes, offset, readLeng, workerNumber, workerSize, progressListener);
      if (mTaskList == null) {
        mTaskList = new ArrayList();
      }
      mTaskList.add(task);
      final Future> future = executorService.submit(task);

      futureList.add(future);
    }
    executorService.shutdown();

    // Remove duplicate indexes
    final List resultIndexList = new CopyOnWriteArrayList();

    for (Future> future : futureList) {

      try {
        List rawIndexList = future.get();

        for (int i = 0; i < rawIndexList.size(); i++) {

          Long longVal = rawIndexList.get(i);

          if (resultIndexList.contains(longVal)) {
            // if already exists , skip
          } else {
            resultIndexList.add(longVal);
          }
        }
      } catch (InterruptedException e) {
        e.printStackTrace();
      } catch (ExecutionException e) {
        e.printStackTrace();
      } catch (CancellationException e) {
        e.printStackTrace();
      } catch (Exception e) {
        e.printStackTrace();
      }
    }
    // Sort in ascending order
    binFileSearcher.sort(resultIndexList);

    _profile_lastEndTime = System.currentTimeMillis();

    return resultIndexList;
  }

  private final class BigFileSearchTask implements Callable> {

    final int workerSize;
    final int workerNumber;
    final File srcFile;
    final byte[] searchBytes;

    final long startPostion;
    final long readLeng;

    final BinFileProgressListenerEx binFileProgressListener;

    BigFileSearchTask(File srcFile, byte[] searchBytes, long startPosition, long readLeng, int workerNumber, int workerSize, BinFileProgressListenerEx listener) {

      this.srcFile = srcFile;
      this.startPostion = startPosition;
      this.readLeng = readLeng;
      this.searchBytes = searchBytes;
      this.binFileProgressListener = listener;
      this.workerNumber = workerNumber;
      this.workerSize = workerSize;
    }

    private BinFileSearcher blockSearchWorker;

    public void cancel() {
      if (blockSearchWorker != null)
        blockSearchWorker.cancel();
    }

    public List call() throws Exception {

      blockSearchWorker = new BinFileSearcher();
      blockSearchWorker.setBufferSize(bufferSizePerWorker);
      blockSearchWorker.setSubThreadSize(subThreadSize);
      blockSearchWorker.setSubBufferSize(subBufferSize);

      if (this.binFileProgressListener != null) {
        blockSearchWorker.setBigFileProgressListener(new BinFileProgressListener() {

          @Override
          public void onProgress(List pointerList, float progress, float currentPosition, float startPosition, long maxSizeToRead) {
            binFileProgressListener.onProgress(workerNumber, workerSize, pointerList, progress);
          }
        });
      }

      final List pointerList = blockSearchWorker.searchPartially(srcFile, searchBytes, startPostion, readLeng);

      return pointerList;
    }
  }

  final static class ProgressCache {

    volatile float[] progress;
    volatile List[] pointerList;

    final List resultPointerList = new ArrayList();

    final boolean useResultCache;
    final int workerSize;

    @SuppressWarnings("unchecked")
    ProgressCache(int workerSize, boolean useResultCache) {
      this.workerSize = workerSize;
      this.progress = new float[workerSize];

      this.useResultCache = useResultCache;

      if (useResultCache) {
        this.pointerList = new CopyOnWriteArrayList[workerSize];
      }
    }

    synchronized void setProgress(int workerNumber, float progress, List pointerList) {
      this.progress[workerNumber] = progress;
      if (useResultCache) {
        if (this.pointerList[workerNumber] == null) {
          this.pointerList[workerNumber] = new CopyOnWriteArrayList();
        }
        this.pointerList[workerNumber].clear();
        this.pointerList[workerNumber].addAll(pointerList);
      }
    }

    // This is now only for realtime result callback interface.
    // So now not need to be thinking of multi thread but ready for multi
    // threading.
    List getResultPointers() {
      resultPointerList.clear();
      for (int i = 0; i < this.pointerList.length; i++) {
        resultPointerList.addAll(this.pointerList[i]);
      }
      // TODO add sort if needed
      return resultPointerList;
    }

    float getProgress() {
      float progress = 0;
      for (int i = 0; i < workerSize; i++) {
        progress += this.progress[i];
      }
      return progress / (float) workerSize;
    }
  }

  // Call from each worker thread
  private void onProgress(final int workerNumber, final int workerSize, final List pointerList, final float progress) {

    if (progressCache == null) {
      progressCache = new ProgressCache(workerSize, (onRealtimeResultListener != null));
    }

    progressCache.setProgress(workerNumber, progress, pointerList);

    if (onProgressListener != null) {
      onProgressListener.onProgress(progressCache.getProgress());
    }

    if (onRealtimeResultListener != null) {
      onRealtimeResultListener.onRealtimeResultListener(progressCache.getProgress(), progressCache.getResultPointers());
    }
  }

  /**
   * 
   * Get operation time in millis of last search
   * 
   * @return
   */
  public long getEllapsedMillis() {
    return _profile_lastEndTime - _profile_lastStartTime;
  }

  /**
   * Profiling method

   */
  public void _showProfile() {
    System.out.println("availableProcessors=" + Runtime.getRuntime().availableProcessors() + " free memory=" + getMegaBytes(Runtime.getRuntime().freeMemory()));
    System.out.println(
        "worker blockSize=" + getMegaBytes(blockSize) + " " +
            "worker buffer Size=" + getMegaBytes(bufferSizePerWorker) + ", " +
            "max num of thread=" + maxNumOfThreads + ", " +
            "sub buffer size=" + subBufferSize + "(B)" + ", " +
            "sub thread size=" + subThreadSize + ", ");

    System.out.println("possible max thread=" + maxNumOfThreads * subThreadSize + " " +
        "possible max memory=" + getMegaBytes(bufferSizePerWorker * maxNumOfThreads + (subBufferSize * subThreadSize)));
  }

  /**
   * Optimize threading and memory
   * 
   * @param fileLength
   */
  private void optimize(long fileLength) {

    final int availableProcessors = Runtime.getRuntime().availableProcessors();

    final long free = Runtime.getRuntime().freeMemory() / 2;

    int workerSize = availableProcessors / 2;
    if (workerSize < 2) {
      workerSize = 2;
    }

    long bufferSize = free / workerSize;
    if (bufferSize > 1 * 1024 * 1024) {
      bufferSize = 1 * 1024 * 1024;
    }

    long blockSize = fileLength / workerSize;
    if (blockSize > 1 * 1024 * 1024) {
      blockSize = 1 * 1024 * 1024;
    }
    int iBlockSize = (int) blockSize;

    if (bufferSize > blockSize) {
      bufferSize = blockSize;
    }
    int iBufferSize = (int) bufferSize;

    this.setBlockSize(iBlockSize);
    this.setMaxNumOfThreads(workerSize);
    this.setBufferSizePerWorker(iBufferSize);
    this.setSubBufferSize(256);

  }

  private String getMegaBytes(long valBytes) {
    return String.format("%.1f(MB)", ((float) valBytes / (1024f * 1024f)));
  }

}