All Downloads are FREE. Search and download functionalities are using the official Maven repository.

eu.interedition.collatex.suffixarray.SuffixArrays Maven / Gradle / Ivy

Go to download

A Java library for collating textual sources, for example, to produce an apparatus.

There is a newer version: 1.7.1
Show newest version
package eu.interedition.collatex.suffixarray;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/*
 * TODO: ultimately, this class should be "intelligent" enough to pick the best
 * algorithm, depending on the distribution and properties of the input (alphabet size,
 * symbols distribution, etc.).
 */

/**
 * 

* Factory-like methods for constructing suffix arrays for various data types. Whenever * defaults are provided, they aim to be sensible, "best guess" values for the given data * type. *

* Note the following important aspects that apply to nearly all methods in this class: *

    *
  • In nearly all cases, the returned suffix array will not be length-equal to the * input sequence (will be slightly larger). It is so because most algorithms use extra * space for end of sequence delimiters and it makes little sense to temporary duplicate * memory consumption just to have exact length counts.
  • *
* * @author Michał Nowak (Carrot Search) * @author Dawid Weiss (Carrot Search) * @author Anton Olsson for friprogramvarusyndikatet.se */ public final class SuffixArrays { /** * Maximum required trailing space in the input array (certain algorithms need it). */ final static int MAX_EXTRA_TRAILING_SPACE = DeepShallow.OVERSHOOT; /* * */ private SuffixArrays() { // no instances. } /** * Create a suffix array for a given character sequence with the default algorithm. */ public static int[] create(CharSequence s) { return create(s, defaultAlgorithm()); } /** * Create a suffix array for a given character sequence, using the provided suffix * array building strategy. */ public static int[] create(CharSequence s, ISuffixArrayBuilder builder) { return new CharSequenceAdapter(builder).buildSuffixArray(s); } /** * Create a suffix array and an LCP array for a given character sequence. * * @see #computeLCP(int[], int, int, int[]) */ public static SuffixData createWithLCP(CharSequence s) { return createWithLCP(s, defaultAlgorithm()); } /** * Create a suffix array and an LCP array for a given character sequence, use the * given algorithm for building the suffix array. * * @see #computeLCP(int[], int, int, int[]) */ public static SuffixData createWithLCP(CharSequence s, ISuffixArrayBuilder builder) { final CharSequenceAdapter adapter = new CharSequenceAdapter(builder); final int[] sa = adapter.buildSuffixArray(s); final int[] lcp = computeLCP(adapter.input, 0, s.length(), sa); return new SuffixData(sa, lcp); } /** * Create a suffix array and an LCP array for a given input sequence of symbols. */ public static SuffixData createWithLCP(int[] input, int start, int length) { final ISuffixArrayBuilder builder = new DensePositiveDecorator( new ExtraTrailingCellsDecorator(defaultAlgorithm(), 3)); return createWithLCP(input, start, length, builder); } /** * Create a suffix array and an LCP array for a given input sequence of symbols and a * custom suffix array building strategy. */ public static SuffixData createWithLCP(int[] input, int start, int length, ISuffixArrayBuilder builder) { final int[] sa = builder.buildSuffixArray(input, start, length); final int[] lcp = computeLCP(input, start, length, sa); return new SuffixData(sa, lcp); } /** * Create a suffix array and an LCP array for a given generic array and a * custom suffix array building strategy, using the given T object * comparator. */ public static SuffixData createWithLCP(T[] input, ISuffixArrayBuilder builder, Comparator comparator) { final GenericArrayAdapter adapter = new GenericArrayAdapter(builder, comparator); final int[] sa = adapter.buildSuffixArray(input); final int[] lcp = computeLCP(adapter.input, 0, input.length, sa); return new SuffixData(sa, lcp); } /** * Calculate longest prefix (LCP) array for an existing suffix array and input. Index * i of the returned array indicates the length of the common prefix * between suffix i and i-1. The 0-th * index has a constant value of -1. *

* The algorithm used to compute the LCP comes from * T. Kasai, G. Lee, H. Arimura, S. Arikawa, and K. Park. Linear-time longest-common-prefix * computation in suffix arrays and its applications. In Proc. 12th Symposium on Combinatorial * Pattern Matching (CPM ’01), pages 181–192. Springer-Verlag LNCS n. 2089, 2001. */ public static int[] computeLCP(int[] input, final int start, final int length, int[] sa) { final int[] rank = new int[length]; for (int i = 0; i < length; i++) rank[sa[i]] = i; int h = 0; final int[] lcp = new int[length]; for (int i = 0; i < length; i++) { int k = rank[i]; if (k == 0) { lcp[k] = -1; } else { final int j = sa[k - 1]; while (i + h < length && j + h < length && input[start + i + h] == input[start + j + h]) { h++; } lcp[k] = h; } if (h > 0) h--; } return lcp; } /** * @return Return a new instance of the default algorithm for use in other methods. At * the moment {@link QSufSort} is used. */ private static ISuffixArrayBuilder defaultAlgorithm() { return new QSufSort(); } /** * Utility method converting all suffixes of a given sequence to a list of strings. */ public static List toString(CharSequence input, int[] suffixes) { final String full = input.toString(); final ArrayList result = new ArrayList<>(); for (int i = 0; i < input.length(); i++) { result.add(full.subSequence(suffixes[i], full.length())); } return result; } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy