All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.aparapi.examples.matrix.CorrMatrixHost Maven / Gradle / Ivy

/**
 * Copyright (c) 2016 - 2017 Syncleus, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * This material was prepared as an account of work sponsored by an agency of the United States Government.  
 * Neither the United States Government nor the United States Department of Energy, nor Battelle, nor any of 
 * their employees, nor any jurisdiction or organization that has cooperated in the development of these materials, 
 * makes any warranty, express or implied, or assumes any legal liability or responsibility for the accuracy, 
 * completeness, or usefulness or any information, apparatus, product, software, or process disclosed, or represents
 * that its use would not infringe privately owned rights.
 */
package com.aparapi.examples.matrix;

import org.apache.log4j.Logger;

import com.aparapi.Kernel;
import com.aparapi.Kernel.EXECUTION_MODE;
import com.aparapi.Range;
import com.aparapi.device.Device;
import com.aparapi.device.OpenCLDevice;

/**
 * GPU calculations using OpenBitSet Intersection for OpenBitSets
 * 
 * Based on code from: 
* {@link http://grepcode.com/file/repo1.maven.org/maven2/org.apache.lucene/lucene-core/3.1.0/org/apache/lucene/util/BitUtil.java} * * @author ryan.lamothe at gmail.com * @author sedillard at gmail.com */ public class CorrMatrixHost { private static final Logger LOG = Logger.getLogger(CorrMatrixHost.class); /** * Perform matrix intersection for two lists of Lucene OpenBitSet-based packed longs * * @param matrixA * The first term-document matrix * @param matrixB * The second term-document matrix * @param Aparapi EXECUTION_MODE * @return result Matrix * @throws Exception */ public static int[][] intersectionMatrix(final long[][] matrixA, final long[][] matrixB, final EXECUTION_MODE executionMode) { // Basic validation if (matrixA == null) { throw new NullPointerException("MatrixA cannot be NULL"); } if (matrixB == null) { throw new NullPointerException("MatrixB cannot be NULL"); } // Size of an array is 8 bytes for the object + 4 bytes for the header and length information final int arrayMemOverhead = 12; // numDocs/64 since they are packed into longs // We need to make our matrix sizes multiples of BLOCK_SIZE final int matrixA_numTerms = matrixA.length; final int matrixA_numLongs = matrixA[0].length; if (LOG.isDebugEnabled()) { LOG.debug("----------"); LOG.debug("MatrixA NumTerms (Rows): " + matrixA_numTerms); LOG.debug("MatrixA NumLongs (Columns): " + matrixA_numLongs); LOG.debug("MatrixA NumDocs: " + (matrixA_numLongs * 64L)); } final long matrixA_BytesPerRow = matrixA_numLongs * 8L; final long matrixA_TotalBytes = (matrixA_numTerms * matrixA_BytesPerRow) + arrayMemOverhead; if (LOG.isDebugEnabled()) { LOG.debug("MatrixA Total Memory Size: " + humanReadableByteCount(matrixA_TotalBytes, true)); } final int matrixB_numTerms = matrixB.length; final int matrixB_numLongs = matrixB[0].length; if (LOG.isDebugEnabled()) { LOG.debug("----------"); LOG.debug("MatrixB NumTerms (Rows): " + matrixB_numTerms); LOG.debug("MatrixB NumLongs (Columns): " + matrixB_numLongs); LOG.debug("MatrixB NumDocs: " + (matrixB_numLongs * 64L)); } final long matrixB_BytesPerRow = matrixB_numLongs * 8L; final long matrixB_TotalBytes = (matrixB_numTerms * matrixB_BytesPerRow) + arrayMemOverhead; if (LOG.isDebugEnabled()) { LOG.debug("MatrixB Total Memory Size: " + humanReadableByteCount(matrixB_TotalBytes, true)); LOG.debug("----------"); } final int[][] resultMatrix = new int[matrixA_numTerms][matrixB_numTerms]; if (LOG.isDebugEnabled()) { final long resultMatrix_TotalBytes = (matrixA_numTerms * matrixB_numTerms * 4L) + arrayMemOverhead; LOG.debug("ResultMatrix Memory Size: " + humanReadableByteCount(resultMatrix_TotalBytes, true)); LOG.debug("Total Requested Memory Size: " + humanReadableByteCount(matrixA_TotalBytes + matrixB_TotalBytes + resultMatrix_TotalBytes, true)); LOG.debug("----------"); } int NUM_SUB_ROWS = matrixA_numTerms; // Default number of sub-rows OpenCLDevice device = null; // We do not test for EXECUTION_MODE.JTP because JTP is non-OpenCL if (executionMode.equals(EXECUTION_MODE.CPU)) { device = (OpenCLDevice) Device.firstCPU(); if (device == null) { LOG.warn("OpenCLDevice.CPU is NULL...OpenCL is unavailable. Setting to JTP mode."); LOG.debug("----------"); } } else if (executionMode.equals(EXECUTION_MODE.GPU)) { device = (OpenCLDevice) Device.best(); if (device == null) { LOG.warn("OpenCLDevice.GPU is NULL...OpenCL is unavailable. Setting to JTP mode."); LOG.debug("----------"); } } // This is to create stripes of rows that will fit into OpenCL's available memory // Calculate the number of sub-rows by calling OpenCL to find out available memory // Length of row * 8 (size of long in bytes) * number of rows to available memory final int maxNumTerms = Math.max(matrixA_numTerms, matrixB_numTerms); if (device != null) { final long globalMemSize = device.getGlobalMemSize(); // final long maxMemAllocSize = Math.max((globalMemSize/4), 128*1024*1024); final long maxMemAllocSize = device.getMaxMemAllocSize(); // 1048576 bytes in a megabyte (1024*1024) // Java long is 8 bytes // 131072 longs in 1 megabyte // SAFE OpenCL spec allocation is max(1/4 GlobalMemSize) // ***During our testing this appears to be incorrectly/inconsistently reported depending on os/drivers/hardware*** if (LOG.isDebugEnabled()) { LOG.debug("Available OpenCL globalMemSize: " + humanReadableByteCount(globalMemSize, true)); LOG.debug("Available OpenCL maxMemAllocSize: " + humanReadableByteCount(maxMemAllocSize, true)); } // Maybe there is a more clever way to do this :) // The idea here is to decide how many sub-rows of the matrix we can fit on a single card // The long-term goal to divide up the work for both small RAM GPUs and multiple GPUs int subRowsCounterA = 0; int subRowsCounterB = 0; long subRowsMemSizeA = 0L; long subRowsMemSizeB = 0L; long subResultMatrixMemSize = 0L; long subTotalMemSize = 0L; do { if (subRowsCounterA < matrixA_numTerms) { subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; subRowsCounterA += 1; } else if (subRowsCounterA == matrixA_numTerms) { subRowsMemSizeA = subRowsCounterA != 0 ? (subRowsCounterA * matrixA_numLongs * 8L) + arrayMemOverhead : 0; } if (subRowsCounterB < matrixB_numTerms) { subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; subRowsCounterB += 1; } else if (subRowsCounterB == matrixB_numTerms) { subRowsMemSizeB = subRowsCounterB != 0 ? (subRowsCounterB * matrixB_numLongs * 8L) + arrayMemOverhead : 0; } // This is 4 bytes since the sub-result matrix is an int array subResultMatrixMemSize = ((subRowsCounterA * subRowsCounterB) * 4L) + arrayMemOverhead; subTotalMemSize = subRowsMemSizeA + subRowsMemSizeB + subResultMatrixMemSize; } while ((Math.max(subRowsCounterA, subRowsCounterB) < maxNumTerms) && (subTotalMemSize <= maxMemAllocSize)); // If using OpenCL override the default number of subrows NUM_SUB_ROWS = Math.max(subRowsCounterA, subRowsCounterB); if (NUM_SUB_ROWS < maxNumTerms) { final long subMatrixA_memSize = (NUM_SUB_ROWS * matrixA_numLongs * 8L) + arrayMemOverhead; final long subMatrixB_memSize = (NUM_SUB_ROWS * matrixB_numLongs * 8L) + arrayMemOverhead; final long subResultMatrix_memSize = (NUM_SUB_ROWS * NUM_SUB_ROWS * 4L) + arrayMemOverhead; LOG.warn("****************************************************************"); LOG.warn("Requested matrix computation is larger than available OpenCL memory"); LOG.warn("Matrix striping is occurring to fit all data into OpenCL memory..."); LOG.warn(""); LOG.warn("Number rows requested: " + maxNumTerms); LOG.warn("Number rows that fit: " + NUM_SUB_ROWS); LOG.warn(""); LOG.warn("SubMatrixA Memory Size: " + humanReadableByteCount(subMatrixA_memSize, true)); LOG.warn("SubMatrixB Memory Size: " + humanReadableByteCount(subMatrixB_memSize, true)); LOG.warn("SubResultMatrix Memory Size: " + humanReadableByteCount(subResultMatrix_memSize, true)); LOG.warn("SubMatrix Total Memory Size: " + humanReadableByteCount(subMatrixA_memSize + subMatrixB_memSize + subResultMatrix_memSize, true)); LOG.warn("****************************************************************"); } } final int numSubBlocksA = ((matrixA_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; final int numSubBlocksB = ((matrixB_numTerms + NUM_SUB_ROWS) - 1) / NUM_SUB_ROWS; final long[] subMatrixA = new long[NUM_SUB_ROWS * matrixA_numLongs]; final long[] subMatrixB = new long[NUM_SUB_ROWS * matrixB_numLongs]; final int[] subResultMatrix = new int[NUM_SUB_ROWS * NUM_SUB_ROWS]; final CorrMatrixKernel kernel = new CorrMatrixKernel(subMatrixA, NUM_SUB_ROWS, subMatrixB, NUM_SUB_ROWS, matrixA_numLongs, subResultMatrix); kernel.setExplicit(true); // Here we define a fall-back strategy, since the user may have wanted to execute only a single execution mode if (executionMode.equals(EXECUTION_MODE.GPU) && (device != null)) { kernel.addExecutionModes(EXECUTION_MODE.GPU, EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); LOG.debug("Execution Fallback Strategy: GPU --> CPU --> JTP"); } else if (executionMode.equals(EXECUTION_MODE.CPU) && (device != null)) { kernel.addExecutionModes(EXECUTION_MODE.CPU, EXECUTION_MODE.JTP); LOG.debug("Execution Fallback Strategy: CPU --> JTP"); } else { kernel.addExecutionModes(EXECUTION_MODE.JTP); LOG.debug("Execution Strategy: JTP"); } try { for (int a = 0; a < numSubBlocksA; a++) { for (int b = 0; b < numSubBlocksB; b++) { final int aSubRowStart = a * NUM_SUB_ROWS; final int aSubRowEnd = Math.min(matrixA_numTerms, aSubRowStart + NUM_SUB_ROWS); for (int i = aSubRowStart; i < aSubRowEnd; i++) { if (matrixA_numLongs != matrixA[i].length) { throw new IllegalStateException("All rows in the matrix need be the same length"); } System.arraycopy(matrixA[i], 0, subMatrixA, (i - aSubRowStart) * matrixA_numLongs, matrixA_numLongs); } final int bSubRowStart = b * NUM_SUB_ROWS; final int bSubRowEnd = Math.min(matrixB_numTerms, bSubRowStart + NUM_SUB_ROWS); for (int i = bSubRowStart; i < bSubRowEnd; i++) { if (matrixA_numLongs != matrixB[i].length) { throw new IllegalStateException("All rows in the matrix need be the same length"); } System.arraycopy(matrixB[i], 0, subMatrixB, (i - bSubRowStart) * matrixB_numLongs, matrixB_numLongs); } // Since matrixA_NumLongs == matrixB_NumLongs we're only going to pass matrixA_NumLongs executeKernel(device, subMatrixA, aSubRowEnd - aSubRowStart, subMatrixB, bSubRowEnd - bSubRowStart, matrixA_numLongs, subResultMatrix, kernel); // Convert one dimensional array to two dimensional array in the expected output ordering for (int i = 0; i < NUM_SUB_ROWS; i++) { if ((i + aSubRowStart) < aSubRowEnd) { System.arraycopy(subResultMatrix, i * NUM_SUB_ROWS, resultMatrix[i + aSubRowStart], bSubRowStart, bSubRowEnd - bSubRowStart); } } } } } finally { if (LOG.isDebugEnabled()) { LOG.debug("----------"); LOG.debug("Aparapi Gross Execution Time: " + kernel.getAccumulatedExecutionTime() + " ms <------ Aparapi"); LOG.debug("OpenCL Generation Time: " + kernel.getConversionTime() + " ms"); LOG.debug("Kernel Net Execution Time: " + (kernel.getAccumulatedExecutionTime() - kernel.getConversionTime()) + " ms"); LOG.debug("----------"); } try { kernel.dispose(); } catch (final UnsatisfiedLinkError e) { LOG.error("Aparapi failed to dispose of the kernel", e); } } return resultMatrix; } /** * Execute the GPU kernel * * @param subMatrixA * @param matrixA_NumTerms * @param subMatrixB * @param matrixB_NumTerms * @param numLongs * @param subResultMatrix * @param kernel * * @return resultMatrix */ private static void executeKernel(final Device device, final long[] subMatrixA, final int matrixA_NumTerms, final long[] subMatrixB, final int matrixB_NumTerms, final int numLongs, final int[] subResultMatrix, final Kernel kernel) { // Power of Two for best performance int matrixA_NumTermsRnd = matrixA_NumTerms; while (!isPowerOfTwo(matrixA_NumTermsRnd)) { matrixA_NumTermsRnd += 1; } int matrixB_NumTermsRnd = matrixB_NumTerms; while (!isPowerOfTwo(matrixB_NumTermsRnd)) { matrixB_NumTermsRnd += 1; } final Range range; if (device != null) { range = Range.create2D(device, matrixA_NumTermsRnd, matrixB_NumTermsRnd); } else { range = Range.create2D(matrixA_NumTermsRnd, matrixB_NumTermsRnd); } if (LOG.isDebugEnabled()) { LOG.debug("Range: " + range); } kernel.put(subMatrixA); kernel.put(subMatrixB); kernel.put(subResultMatrix); kernel.execute(range); kernel.get(subResultMatrix); } /** * Highly efficient means to compute whether a number is a power of 2
* Based on code from http://graphics.stanford.edu/~seander/bithacks.html#DetermineIfPowerOf2 *

* Another very cool way to do this is ((x&(-x))==x) * * @param n * @return boolean */ private static boolean isPowerOfTwo(int n) { return (n > 0) && ((n & (n - 1)) == 0); } /** * Rounds a number to the multiple indicated * * @param num * @param multiple * @return */ private static int roundToMultiple(double num, int multiple) { return (int) (Math.ceil(num / multiple) * multiple); } /** * Very nice means to convert byte sizes into human readable format
* Based on code from http://stackoverflow.com/questions/3758606/how-to-convert-byte-size-into-human-readable-format-in-java *

* * @param bytes * @param si * @return humanReadableByteCount */ private static String humanReadableByteCount(long bytes, boolean si) { final int unit = si ? 1000 : 1024; if (bytes < unit) { return bytes + " B"; } final int exp = (int) (Math.log(bytes) / Math.log(unit)); final String pre = (si ? "kMGTPE" : "KMGTPE").charAt(exp - 1) + (si ? "" : "i"); return String.format("%.1f %sB", bytes / Math.pow(unit, exp), pre); } }





© 2015 - 2025 Weber Informatics LLC | Privacy Policy