org.apfloat.internal.IntMatrixStrategy Maven / Gradle / Ivy
Show all versions of apfloat Show documentation
package org.apfloat.internal;
import org.apfloat.ApfloatContext;
import org.apfloat.ApfloatRuntimeException;
import org.apfloat.spi.MatrixStrategy;
import org.apfloat.spi.ArrayAccess;
import org.apfloat.spi.Util;
/**
* Optimized matrix transposition methods for the int
type.
* The matrix transposition algorithm isn't parallelized.
*
* While the matrix transposition algorithm could easily be parallelized,
* on an SMP machine it does not make any sense. If the matrix doesn't fit
* in any processor specific cache then the memory (or higher level
* shared cache) bandwidth becomes a bottleneck in the algorithm. Matrix
* transposition is in principle a very simple algorithm - it doesn't do
* anything else than move data from one place to another. If shared memory
* is the bottleneck, then the algorithm isn't any faster if the data is being
* moved around by one thread or by multiple threads in parallel.
*
* If the data fits in a processor specific cache, then the algorithm could
* theoretically be made faster with parallelization. To make the parallelization
* effective however, the data would have to be set up in some kind of a NUMA
* way. For example, each processor core would hold an equal section of
* the data in the processor cache. Then the algorithm could be made faster
* as each processor core could quickly transpose blocks of data that are in the
* processor cache, and then exchange blocks with other processor cores via the
* slower higher level shared cache or main memory.
*
* This approach doesn't work well in practice however, at least not in a Java
* program. The reason is that there are no guarantees where the data is when
* the algorithm starts (in which processor core caches), and further there are
* no guarantees of any processor affinity for the threads that are executing
* in parallel. Different processor cores could be executing the transposition
* of different sections of the data at any moment, depending on how the
* operating system (and the JVM) schedule thread execution. And more often
* than not, the operating system isn't smart enough to apply any such processor
* affinity for the threads.
*
* An additional problem for any NUMA based attempt is that the data array would
* have to be aligned on a cache line (e.g. 64 or 128 bytes), to prevent
* cache contention at the edges of each data section. But a JVM makes no such
* guarantees about memory alignment. And since pointers do not exist in Java,
* manually aligning memory addresses isn't possible.
*
* Considering all of the above, the parallel algorithm doesn't in practice work
* any faster than the single-thread algorithm, as the algorithm is bound by the
* memory bandwidth (or shared cache bandwidth). In some cases parallelization
* can even make the execution slower due to increased cache contention.
*
* @since 1.7.0
* @version 1.7.0
* @author Mikko Tommila
*/
public class IntMatrixStrategy
implements MatrixStrategy
{
/**
* Default constructor.
*/
public IntMatrixStrategy()
{
}
/**
* Transpose a n1 x n2 matrix.
*
* Both n1 and n2 must be powers of two.
* Additionally, one of these must be true:
*
* n1 = n2
* n1 = 2*n2
* n2 = 2*n1
*
* @param arrayAccess Accessor to the matrix data. This data will be transposed.
* @param n1 Number of rows in the matrix.
* @param n2 Number of columns in the matrix.
*/
public void transpose(ArrayAccess arrayAccess, int n1, int n2)
throws ApfloatRuntimeException
{
int[] data = arrayAccess.getIntData();
int offset = arrayAccess.getOffset();
if (n1 != (n1 & -n1) ||
n2 != (n2 & -n2) ||
n1 <= 0 || n2 <= 0)
{
throw new ApfloatInternalException("Matrix size must be a power of two, not " + n1 + " x " + n2);
}
if (n1 == n2)
{
// Simply transpose
transposeSquare(data, offset, n1, n1);
}
else if (n2 == 2 * n1)
{
// First transpose two n1 x n1 blocks
transposeSquare(data, offset, n1, n2);
transposeSquare(data, offset + n1, n1, n2);
// Then permute the rows to correct order
permuteToHalfWidth(data, offset, n1, n2);
}
else if (n1 == 2 * n2)
{
// First permute the rows to correct order
permuteToDoubleWidth(data, offset, n1, n2);
// Then transpose two n2 x n2 blocks
transposeSquare(data, offset, n2, n1);
transposeSquare(data, offset + n2, n2, n1);
}
else
{
throw new ApfloatInternalException("Must be n1 = n2, n1 = 2*n2 or n2 = 2*n1; matrix is " + n1 + " x " + n2);
}
}
/**
* Transpose a square n1 x n1 block of n1 x n2 matrix.
*
* Both n1 and n2 must be powers of two,
* and n1 <= n2.
*
* @param arrayAccess Accessor to the matrix data. This data will be transposed.
* @param n1 Number of rows and columns in the block to be transposed.
* @param n2 Number of columns in the matrix.
*/
public void transposeSquare(ArrayAccess arrayAccess, int n1, int n2)
throws ApfloatRuntimeException
{
transposeSquare(arrayAccess.getIntData(), arrayAccess.getOffset(), n1, n2);
}
/**
* Permute the rows of the n1 x n2 matrix so that it is shaped like a
* n1/2 x 2*n2 matrix. Logically, the matrix is split in half, and the
* lower half is moved to the right side of the upper half.
*
* Both n1 and n2 must be powers of two,
* and n1 >= 2.
*
* E.g. if the matrix layout is originally as follows:
*
*
* 0 1 2 3
*
*
* 4 5 6 7
*
*
* 8 9 10 11
*
*
* 12 13 14 15
*
*
*
* Then after this method it is as follows:
*
*
* 0 1 2 3 8 9 10 11
*
*
* 4 5 6 7 12 13 14 15
*
*
*
* @param arrayAccess Accessor to the matrix data. This data will be permuted.
* @param n1 Number of rows in the matrix.
* @param n2 Number of columns in the matrix.
*
* @since 1.7.0
*/
public void permuteToDoubleWidth(ArrayAccess arrayAccess, int n1, int n2)
throws ApfloatRuntimeException
{
if (n1 != (n1 & -n1) ||
n2 != (n2 & -n2) ||
n1 <= 0 || n2 <= 0)
{
throw new ApfloatInternalException("Matrix size must be a power of two, not " + n1 + " x " + n2);
}
if (n1 < 2)
{
throw new ApfloatInternalException("Matrix height must be at least 2.");
}
permuteToDoubleWidth(arrayAccess.getIntData(), arrayAccess.getOffset(), n1, n2);
}
/**
* Permute the rows of the n1 x n2 matrix so that it is shaped like a
* 2*n1 x n2/2 matrix. Logically, the matrix is split in half, and the
* right half is moved below the left half.
*
* Both n1 and n2 must be powers of two.
*
* E.g. if the matrix layout is originally as follows:
*
*
* 0 1 2 3 4 5 6 7
*
*
* 8 9 10 11 12 13 14 15
*
*
*
* Then after this method it is as follows:
*
*
* 0 1 2 3
*
*
* 8 9 10 11
*
*
* 4 5 6 7
*
*
* 12 13 14 15
*
*
*
* @param arrayAccess Accessor to the matrix data. This data will be permuted.
* @param n1 Number of rows in the matrix.
* @param n2 Number of columns in the matrix.
*
* @since 1.7.0
*/
public void permuteToHalfWidth(ArrayAccess arrayAccess, int n1, int n2)
throws ApfloatRuntimeException
{
if (n1 != (n1 & -n1) ||
n2 != (n2 & -n2) ||
n1 <= 0 || n2 <= 0)
{
throw new ApfloatInternalException("Matrix size must be a power of two, not " + n1 + " x " + n2);
}
permuteToHalfWidth(arrayAccess.getIntData(), arrayAccess.getOffset(), n1, n2);
}
// Move a b x b block from source to dest
private static void moveBlock(int[] source, int sourceOffset, int sourceWidth, int[] dest, int destOffset, int destWidth, int b)
{
for (int i = 0; i < b; i++)
{
System.arraycopy(source, sourceOffset, dest, destOffset, b);
destOffset += destWidth;
sourceOffset += sourceWidth;
}
}
// Transpose two b x b blocks of matrix with specified width
// data based on offset1 is accessed in columns, data based on offset2 in rows
private static void transpose2blocks(int[] data, int offset1, int offset2, int width, int b)
{
for (int i = 0, position1 = offset2; i < b; i++, position1 += width)
{
for (int j = 0, position2 = offset1 + i; j < b; j++, position2 += width)
{
int tmp = data[position1 + j];
data[position1 + j] = data[position2];
data[position2] = tmp;
}
}
}
// Transpose a b x b block of matrix with specified width
private static void transposeBlock(int[] data, int offset, int width, int b)
{
for (int i = 0, position1 = offset; i < b; i++, position1 += width)
{
for (int j = i + 1, position2 = offset + j * width + i; j < b; j++, position2 += width)
{
int tmp = data[position1 + j];
data[position1 + j] = data[position2];
data[position2] = tmp;
}
}
}
// Transpose a square n1 x n1 block of n1 x n2 matrix in b x b blocks
private static void transposeSquare(int[] data, int offset, int n1, int n2)
{
ApfloatContext ctx = ApfloatContext.getContext();
int cacheBurstBlockSize = Util.round2down(ctx.getCacheBurst() / 8), // Cache burst in ints
cacheBlockSize = Util.sqrt4down(ctx.getCacheL1Size() / 8), // Transpose block size b that fits in processor L1 cache
cacheTreshold = Util.round2down(ctx.getCacheL2Size() / 8); // Size of matrix that fits in L2 cache
if (n1 <= cacheBurstBlockSize || n1 <= cacheBlockSize)
{
// Whole matrix fits in L1 cache
transposeBlock(data, offset, n2, n1);
}
else if (n1 * n2 <= cacheTreshold)
{
// Whole matrix fits in L2 cache (but not in L1 cache)
// Sometimes the first algorithm (the block above) is faster, if your L2 cache is very fast
int b = cacheBurstBlockSize;
for (int i = 0, position1 = offset; i < n1; i += b, position1 += b * n2)
{
transposeBlock(data, position1 + i, n2, b);
for (int j = i + b, position2 = offset + j * n2 + i; j < n1; j += b, position2 += b * n2)
{
transpose2blocks(data, position1 + j, position2, n2, b);
}
}
}
else
{
// Whole matrix doesn't fit in L2 cache
// This algorithm works fastest if L1 cache size is set correctly
int b = cacheBlockSize;
int[] tmp1 = new int[b * b],
tmp2 = new int[b * b];
for (int i = 0, position1 = offset; i < n1; i += b, position1 += b * n2)
{
moveBlock(data, position1 + i, n2, tmp1, 0, b, b);
transposeBlock(tmp1, 0, b, b);
moveBlock(tmp1, 0, b, data, position1 + i, n2, b);
for (int j = i + b, position2 = offset + j * n2 + i; j < n1; j += b, position2 += b * n2)
{
moveBlock(data, position1 + j, n2, tmp1, 0, b, b);
transposeBlock(tmp1, 0, b, b);
moveBlock(data, position2, n2, tmp2, 0, b, b);
transposeBlock(tmp2, 0, b, b);
moveBlock(tmp2, 0, b, data, position1 + j, n2, b);
moveBlock(tmp1, 0, b, data, position2, n2, b);
}
}
}
}
// Permute the rows of matrix to correct order, to make the n1 x n2 matrix half as wide (2*n1 x n2/2)
private static void permuteToHalfWidth(int[] data, int offset, int n1, int n2)
{
if (n1 < 2)
{
return;
}
int twicen1 = 2 * n1;
int halfn2 = n2 / 2;
int[] tmp = new int[halfn2];
boolean[] isRowDone = new boolean[twicen1];
int j = 1;
do
{
int o = j,
m = j;
System.arraycopy(data, offset + halfn2 * m, tmp, 0, halfn2);
isRowDone[m] = true;
m = (m < n1 ? 2 * m : 2 * (m - n1) + 1);
while (m != j)
{
isRowDone[m] = true;
System.arraycopy(data, offset + halfn2 * m, data, offset + halfn2 * o, halfn2);
o = m;
m = (m < n1 ? 2 * m : 2 * (m - n1) + 1);
}
System.arraycopy(tmp, 0, data, offset + halfn2 * o, halfn2);
while (isRowDone[j])
{
j++;
}
} while (j < twicen1 - 1);
}
// Permute the rows of matrix to correct order, to make the n1 x n2 matrix twice as wide (n1/2 x 2*n2)
private static void permuteToDoubleWidth(int[] data, int offset, int n1, int n2)
{
if (n1 < 4)
{
return;
}
int halfn1 = n1 / 2;
int[] tmp = new int[n2];
boolean[] isRowDone = new boolean[n1];
int j = 1;
do
{
int o = j,
m = j;
System.arraycopy(data, offset + n2 * m, tmp, 0, n2);
isRowDone[m] = true;
m = ((m & 1) != 0 ? m / 2 + halfn1 : m / 2);
while (m != j)
{
isRowDone[m] = true;
System.arraycopy(data, offset + n2 * m, data, offset + n2 * o, n2);
o = m;
m = ((m & 1) != 0 ? m / 2 + halfn1 : m / 2);
}
System.arraycopy(tmp, 0, data, offset + n2 * o, n2);
while (isRowDone[j])
{
j++;
}
} while (j < n1 - 1);
}
}