All downloads are free. Search and download functionality uses the official Maven repository.

com.github.steveash.jg2p.util.TokenSeqUtil Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2014 Steve Ash
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.steveash.jg2p.util;

import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;

import com.github.steveash.jg2p.Grams;
import com.github.steveash.jg2p.phoseq.Graphemes;

import java.util.List;

import javax.annotation.Nullable;

import cc.mallet.types.Token;

import static com.google.common.base.CharMatcher.WHITESPACE;

/**
 * Static helpers for working with sequences of grams (strings) and Mallet {@link Token}s:
 * counting symbols before/after a position, extracting fixed-width character windows
 * relative to a position, and converting a window into a consonant/vowel "shape" string.
 *
 * @author Steve Ash
 */
public class TokenSeqUtil {

  /**
   * Sums {@link Grams#countInGram} over every gram that precedes {@code current}.
   *
   * @param grams   the gram strings
   * @param current index of the current gram; grams at indexes {@code [0, current)} are counted
   *                (clamped to the size of the list)
   * @return the total count over the preceding grams, 0 if there are none
   */
  public static int countBefore(List<String> grams, int current) {
    int count = 0;
    for (int i = 0; i < grams.size() && i < current; i++) {
      count += Grams.countInGram(grams.get(i));
    }
    return count;
  }

  /**
   * Sums {@link Grams#countInGram} over every gram that follows {@code current}.
   *
   * @param grams   the gram strings
   * @param current index of the current gram; grams at indexes {@code (current, size)} are counted
   * @return the total count over the following grams, 0 if there are none
   */
  public static int countAfter(List<String> grams, int current) {
    int count = 0;
    for (int i = current + 1; i < grams.size(); i++) {
      count += Grams.countInGram(grams.get(i));
    }
    return count;
  }

  /**
   * Same as {@link #getWindowFromStrings(List, int, int, int)} but operating on the text of each
   * token (via {@link #tokenToString}).
   *
   * @param ts           the token sequence
   * @param current      index of the current token
   * @param windowOffset non-zero offset of the window relative to the current token
   * @param windowWidth  number of characters in the window
   * @return the window, or {@code null} if the sequence does not have enough characters
   */
  @Nullable
  public static String getWindow(List<Token> ts, int current, int windowOffset, int windowWidth) {
    List<String> ss = Lists.transform(ts, tokenToString);
    return getWindowFromStrings(ss, current, windowOffset, windowWidth);
  }

  /**
   * Extracts a window of {@code windowWidth} non-space characters from the grams surrounding
   * (but never including) the gram at {@code current}.
   *
   * <p>Characters are numbered by their distance from the current gram, skipping space
   * characters, so it does not matter whether gram strings are space separated or not. Distance
   * 1 is the character immediately adjacent to the current gram.
   * <ul>
   * <li>For a negative {@code windowOffset} the window covers the characters to the left at
   * distances {@code |windowOffset|} down to {@code |windowOffset| - windowWidth + 1};
   * {@code windowOffset + windowWidth} must be {@code <= 0}.</li>
   * <li>For a positive {@code windowOffset} the window covers the characters to the right at
   * distances {@code windowOffset} up to {@code windowOffset + windowWidth - 1}.</li>
   * </ul>
   * The result is always in left-to-right reading order.
   *
   * @param ts           the gram strings
   * @param current      index of the current gram
   * @param windowOffset non-zero offset of the window; negative looks backward, positive forward
   * @param windowWidth  number of characters in the window
   * @return the window, or {@code null} if there are not enough characters in that direction
   * @throws IllegalArgumentException if {@code windowOffset} is zero or the offset/width
   *                                  combination is invalid
   */
  @Nullable
  public static String getWindowFromStrings(List<String> ts, int current, int windowOffset, int windowWidth) {
    if (windowOffset < 0) {
      return getBackwardWindowFromString(ts, current, windowOffset, windowWidth);
    }
    // zero is rejected by the forward helper's precondition
    return getForwardWindowFromString(ts, current, windowOffset, windowWidth);
  }

  /**
   * Walks backward from the gram just before {@code current}, skipping spaces, and collects the
   * requested window.
   */
  @Nullable
  private static String getBackwardWindowFromString(List<String> ts, int current, int windowOffset,
      int windowWidth) {
    Preconditions.checkArgument(windowOffset < 0,
        "windowOffset must be negative for a backward window, got %s", windowOffset);
    Preconditions.checkArgument(windowOffset + windowWidth <= 0,
        "windowOffset + windowWidth must be <= 0 for a backward window, got %s + %s",
        windowOffset, windowWidth);

    // number of non-space characters (nearest to current) to pass over before collecting
    int toSkip = -(windowOffset + windowWidth);

    StringBuilder sb = new StringBuilder(windowWidth);
    int eaten = 0;
    for (int gramIndex = current - 1; gramIndex >= 0; gramIndex--) {
      String gram = ts.get(gramIndex);
      for (int charIndex = gram.length() - 1; charIndex >= 0; charIndex--) {
        char c = gram.charAt(charIndex);
        if (c == ' ') {
          continue;
        }
        if (eaten >= toSkip) {
          sb.append(c);
          if (sb.length() == windowWidth) {
            // characters were collected right-to-left; flip to reading order
            return sb.reverse().toString();
          }
        }
        eaten += 1;
      }
    }
    return null; // ran out of chars to eat
  }

  /**
   * Walks forward from the gram just after {@code current}, skipping spaces, and collects the
   * requested window.
   */
  @Nullable
  private static String getForwardWindowFromString(List<String> ts, int current, int windowOffset,
      int windowWidth) {
    Preconditions.checkArgument(windowOffset > 0,
        "windowOffset must be positive for a forward window (zero is not valid), got %s",
        windowOffset);
    Preconditions.checkArgument(windowOffset + windowWidth > 0,
        "windowOffset + windowWidth must be > 0 for a forward window, got %s + %s",
        windowOffset, windowWidth);

    // we're starting one character over from us, to be symmetric needs to be shifted
    int toSkip = windowOffset - 1;

    StringBuilder sb = new StringBuilder(windowWidth);
    int eaten = 0;
    for (int gramIndex = current + 1; gramIndex < ts.size(); gramIndex++) {
      String gram = ts.get(gramIndex);
      for (int charIndex = 0; charIndex < gram.length(); charIndex++) {
        char c = gram.charAt(charIndex);
        if (c == ' ') {
          continue;
        }
        if (eaten >= toSkip) {
          sb.append(c);
          if (sb.length() == windowWidth) {
            return sb.toString();
          }
        }
        eaten += 1;
      }
    }
    return null; // ran out of chars to eat
  }

  /** Maps a {@link Token} to the string returned by {@link Token#getText()}. */
  public static final Function<Token, String> tokenToString = new Function<Token, String>() {
    @Override
    public String apply(Token input) {
      return input.getText();
    }
  };

  /**
   * Converts a window string into its "shape" by replacing every character with a class marker:
   * {@code 'c'} if {@link Graphemes#consonants} matches, else {@code 'v'} if
   * {@link Graphemes#vowels} matches, else {@code 's'} for whitespace, else {@code 'p'}.
   *
   * @param winStr the window string, may be {@code null}
   * @return a string of the same length made of the markers above, or {@code null} if
   *         {@code winStr} is {@code null}
   */
  @Nullable
  public static String convertShape(@Nullable String winStr) {
    if (winStr == null) {
      return null;
    }
    StringBuilder sb = new StringBuilder(winStr.length());
    for (int i = 0; i < winStr.length(); i++) {
      char c = winStr.charAt(i);
      if (Graphemes.consonants.matches(c)) {
        sb.append('c');
      } else if (Graphemes.vowels.matches(c)) {
        sb.append('v');
      } else if (WHITESPACE.matches(c)) {
        sb.append('s');
      } else {
        sb.append('p');
      }
    }
    return sb.toString();
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy