All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.steveash.jg2p.align.InputReader Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2014 Steve Ash
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.steveash.jg2p.align;

import com.google.common.base.CharMatcher;
import com.google.common.base.Charsets;
import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import com.google.common.io.CharSource;
import com.google.common.io.Files;
import com.google.common.io.LineProcessor;
import com.google.common.io.Resources;

import com.github.steveash.jg2p.Grams;
import com.github.steveash.jg2p.Word;
import com.github.steveash.jg2p.syll.SWord;

import org.apache.commons.lang3.StringUtils;

import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static org.apache.commons.lang3.StringUtils.isBlank;
import static org.apache.commons.lang3.StringUtils.isNotBlank;
import static sun.management.snmp.jvminstr.JvmThreadInstanceEntryImpl.ThreadStateMap.Byte1.other;

/**
 * @author Steve Ash
 */
public class InputReader {

  private static final Splitter tabSplit = Splitter.on('\t').trimResults();

  public static interface LineReader {

    /**
     * Return the input record or null if this line shouldnt be included
     */
    InputRecord parse(String line);
  }

  public static InputReader makeDefaultFormatReader() {
    return new InputReader(defaultLineReader);
  }

  public static InputReader makePSaurusReader() {
    return new InputReader(psaurusLineReader);
  }

  public static InputReader makeCmuReader() {
    return new InputReader(cmuReader);
  }

  private final LineReader reader;

  public InputReader() {
    this.reader = defaultLineReader;
  }

  public InputReader(LineReader reader) {
    this.reader = reader;
  }

  public List readFromClasspath(String resource) throws IOException {
    return read(Resources.asCharSource(Resources.getResource(resource), Charsets.UTF_8));
  }

  public List readFromFile(File input) throws IOException {
    return read(Files.asCharSource(input, Charsets.UTF_8));
  }

  public List read(CharSource source) throws IOException {
    return source.readLines(new LineProcessor>() {
      private final List recs = Lists.newArrayList();

      @Override
      public boolean processLine(String line) throws IOException {
        if (isBlank(line)) {
          return true;
        }
        if (isComment(line)) {
          return true;
        }

        InputRecord maybe = reader.parse(line);
        if (maybe != null) {
          recs.add(maybe);
        }
        return true;
      }

      @Override
      public List getResult() {
        return recs;
      }
    });
  }

  private boolean isComment(String line) {
    return line.startsWith(";;");
  }

  private static LineReader defaultLineReader = new LineReader() {
    @Override
    public InputRecord parse(String line) {
      Iterator iter = tabSplit.split(line).iterator();
      Word x = Word.fromSpaceSeparated(iter.next());
      Word y = Word.fromSpaceSeparated(iter.next());
      return new InputRecord(x, y);
    }
  };

  private static LineReader psaurusLineReader = new LineReader() {
    @Override
    public InputRecord parse(String line) {
      Iterator iter = tabSplit.split(line).iterator();
      Word x = Word.fromNormalString(iter.next());
      String phones = iter.next();
      Word y;
      if (iter.hasNext()) {
        String sylls = iter.next();
        if (iter.hasNext()) {
          String stress = iter.next();
          y = new SWord(phones, sylls, stress);
        } else {
          y = new SWord(phones, sylls);
        }
      } else {
        y = Word.fromSpaceSeparated(phones);
      }
      return new InputRecord(x, y);
    }
  };

  private static final Pattern xChars = Pattern.compile("([A-Z][A-Z']+)(\\(\\d+\\))?", Pattern.CASE_INSENSITIVE);
  private static LineReader cmuReader = new LineReader() {
    @Override
    public InputRecord parse(String line) {
      line = line.toUpperCase();
      // im going to ignore all of the "spelled out" versions of punctuation

      int split = line.indexOf("  ");
      String x = line.substring(0, split);
      Matcher xGood = xChars.matcher(x);
      if (!xGood.matches()) {
        return null;
      }
//      if (isNotBlank(xGood.group(2))) {
//        return null;
//      }
      x = xGood.group(1);

      String y = line.substring(split + 2);
      String phonesOnly = CharMatcher.DIGIT.removeFrom(y);
      if (x.charAt(x.length() - 1) == '\'') {
        x = x.substring(0, x.length() - 1);
      }
      List stresses = Lists.newArrayList();
      for (String s : Grams.iterateSymbols(y)) {
        String onlyDigits = CharMatcher.DIGIT.retainFrom(s);
        if (StringUtils.isBlank(onlyDigits)) {
          stresses.add(-1);
        } else {
          stresses.add(Integer.parseInt(onlyDigits));
        }
      }

      Word xx = Word.fromNormalString(x);
      Word yy = Word.fromSpaceSeparated(phonesOnly);
      return new InputRecord(xx, yy, stresses);
    }
  };
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy