All Downloads are FREE. Search and download functionalities are using the official Maven repository.

org.simmetrics.tokenizers.QGram Maven / Gradle / Ivy

There is a newer version: 4.1.1
Show newest version
/*
 * SimMetrics - SimMetrics is a java library of Similarity or Distance Metrics,
 * e.g. Levenshtein Distance, that provide float based similarity measures
 * between String Data. All metrics return consistent measures rather than
 * unbounded similarity scores.
 * 
 * Copyright (C) 2014 SimMetrics authors
 * 
 * This file is part of SimMetrics. This program is free software: you can
 * redistribute it and/or modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation, either version 3 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 * 
 * You should have received a copy of the GNU General Public License along with
 * SimMetrics. If not, see <http://www.gnu.org/licenses/>.
 */

package org.simmetrics.tokenizers;

import static com.google.common.base.Preconditions.checkArgument;
import static java.util.Collections.emptyList;
import static java.util.Collections.singletonList;

import java.util.ArrayList;
import java.util.List;

/**
 * Basic q-gram tokenizer for a configurable {@code q}. Splits the input into
 * all overlapping substrings of length {@code q}, in order of their starting
 * position. Input that is not longer than {@code q} is returned as a single
 * token containing the original input; empty input yields no tokens.
 *
 * <p>
 * This class is immutable and thread-safe.
 */
public class QGram extends AbstractTokenizer {

    /** Length of every token produced for inputs longer than {@code q}. */
    private final int q;

    /**
     * Constructs a q-gram tokenizer with the given q.
     *
     * @param q
     *            size of the tokens
     * @throws IllegalArgumentException
     *             if {@code q} is not greater than 0
     */
    public QGram(int q) {
        checkArgument(q > 0, "q must be greater then 0");
        this.q = q;
    }

    /**
     * Returns the q of this tokenizer.
     *
     * @return the q of this tokenizer
     */
    public int getQ() {
        return q;
    }

    /**
     * Splits the input into its overlapping q-grams.
     *
     * @param input
     *            the string to tokenize
     * @return an empty list for empty input, a single-element list holding
     *         {@code input} when it is at most {@code q} characters long,
     *         otherwise the {@code input.length() - q + 1} substrings of
     *         length {@code q} ordered by start index
     * @throws NullPointerException
     *             if {@code input} is null
     */
    @Override
    public List<String> tokenizeToList(final String input) {
        if (input.isEmpty()) {
            return emptyList();
        }

        // Too short to slide a window over: the input itself is the only token.
        if (input.length() <= q) {
            return singletonList(input);
        }

        // One token per possible window start; presize to exactly that count.
        final int count = input.length() - q + 1;
        final List<String> ret = new ArrayList<>(count);
        for (int i = 0; i < count; i++) {
            ret.add(input.substring(i, i + q));
        }
        return ret;
    }

    @Override
    public String toString() {
        return "QGramTokenizer [q=" + q + "]";
    }
}





© 2015 - 2025 Weber Informatics LLC | Privacy Policy