All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.steveash.jg2p.Grams Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2014 Steve Ash
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.steveash.jg2p;

import com.google.common.base.CharMatcher;
import com.google.common.base.Function;
import com.google.common.base.Splitter;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;

import com.github.steveash.jg2p.align.GramOptions;
import com.github.steveash.jg2p.util.NestedLoopPairIterable;

import org.apache.commons.lang3.tuple.Pair;

import java.util.List;

import static com.google.common.collect.Iterables.concat;
import static com.google.common.collect.Iterables.transform;
import static org.apache.commons.lang3.StringUtils.isBlank;

/**
 * Utility class for iterating over grams, lists of words X Y into grams, etc.
 *
 * @author Steve Ash
 */
public class Grams {
  public static final String EPSILON = "";
  private static final Splitter splitter = Splitter.on(CharMatcher.WHITESPACE).trimResults().omitEmptyStrings();

  private Grams() { }

  public static int countInGram(String gram) {
    if (isBlank(gram)) {
      return 0;
    }
    int count = 0;
    for (int i = 0; i < gram.length(); i++) {
      if (gram.charAt(i) == ' ') {
        count += 1;
      }
    }
    return count + 1; // gram count is number of spaces + 1
  }

  public static Iterable> wordPairsToAllGrams(Iterable> words,
                                                                             final GramOptions opts) {

    return concat(transform(words, new Function, Iterable>>() {
      @Override
      public Iterable> apply(Pair input) {
        return gramProduct(input.getLeft(), input.getRight(), opts);
      }
    }));
  }

  public static Iterable> gramProduct(Word xWord, Word yWord, GramOptions opts) {

    Iterable xGrams = xWord.gramsSizes(opts.getMinXGram(), opts.getMaxXGram());
    Iterable yGrams = yWord.gramsSizes(opts.getMinYGram(), opts.getMaxYGram());
    NestedLoopPairIterable pairs = NestedLoopPairIterable.of(xGrams, yGrams);

    Iterable> xEps = ImmutableList.of();
    Iterable> epsY = ImmutableList.of();

    if (opts.isIncludeXEpsilons()) {
      xEps = gramEpsilons(xGrams);
    }
    if (opts.isIncludeEpsilonYs()) {
      epsY = epsilonGrams(yGrams);
    }

    return concat(pairs, xEps, epsY);
  }

  public static Iterable> gramEpsilons(Iterable grams) {
    return transform(grams, toGramEps);
  }

  public static Iterable> epsilonGrams(Iterable grams) {
    return transform(grams, toEpsGram);
  }

  private static final Function> toGramEps = new Function>() {
    @Override
    public Pair apply(String input) {
      return Pair.of(input, EPSILON);
    }
  };

  private static final Function> toEpsGram = new Function>() {
    @Override
    public Pair apply(String input) {
      return Pair.of(EPSILON, input);
    }
  };

  /**
   * Takes an input list of grams and flattens to a list of unigrams; if all input grams
   * are unigrams then the resulting list is identical
   * @param grams
   * @return
   */
  public static List flattenGrams(List grams) {
    List result = Lists.newArrayListWithExpectedSize(grams.size());
    for (String gram : grams) {
      for (String uni : splitter.split(gram)) {
        result.add(uni);
      }
    }
    return result;
  }

  public static Iterable iterateSymbols(String gram) {
    return splitter.split(gram);
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy