All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.github.steveash.jg2p.rerank.VowelReplacer Maven / Gradle / Ivy

The newest version!
/*
 * Copyright 2015 Steve Ash
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.github.steveash.jg2p.rerank;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableListMultimap;
import com.google.common.collect.ImmutableMultimap;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import com.github.steveash.jg2p.PhoneticEncoder;
import com.github.steveash.jg2p.phoseq.Phonemes;

import java.util.List;
import java.util.Map;

/**
 * Generates new candidate encodings for particular entries just by replacing particular vowels with frequently confused
 * vowels
 *
 * @author Steve Ash
 */
public class VowelReplacer {

  private static final int REPLACE_TOP_K = 4;
  private static final ImmutableMultimap replaces = ImmutableListMultimap.builder()
      .put("AH", "AA")
      .put("AH", "AE")
      .put("AH", "IH")
      .put("AA", "AH")
      .put("AE", "AA")
      .put("AH", "EH")
      .put("IH", "AH")
      .put("AE", "AH")
      .put("AA", "AE")
      .put("EH", "AH")
      .put("OW", "AA")
      .put("AA", "OW")
      .put("IH", "IY")
      .put("AH", "IY")
      .put("AH", "OW")
      .put("IH", "AY")
      .put("OW", "AH")
      .build();

  public List updateResults(List original) {
    // updates the top k (defaults k = 4) entries by adding similar vowel versions
    Map, PhoneticEncoder.Encoding> results = null;
    for (int i = 0; i < original.size() && i < REPLACE_TOP_K; i++) {
      List emitted = emit(original.get(i));
      if (emitted.isEmpty()) {
        continue;
      }
      if (results == null) {
        results = Maps.newHashMap();
        for (PhoneticEncoder.Encoding encoding : original) {
          results.put(encoding.getPhones(), encoding);
        }
      }
      // add these if they aren't alreaday present
      for (PhoneticEncoder.Encoding emittedEncoding : emitted) {
        if (!results.containsKey(emittedEncoding.getPhones())) {
          results.put(emittedEncoding.getPhones(), emittedEncoding);
        }
      }
    }
    if (results == null) {
      return original;
    }
    return PhoneticEncoder.OrderByTagScore.sortedCopy(results.values());
  }

  /**
   * This returns any replacements that can be made for the given input
   */
  public List emit(PhoneticEncoder.Encoding input) {
    // find the first vowel or second vowel -- that is in the replacement list and return that
    int vowelCount = 0;
    List results = null;
    for (int i = 0; i < input.getPhones().size(); i++) {
      String p = input.getPhones().get(i);
      if (Phonemes.isVowel(p)) {
        vowelCount += 1;
        if (vowelCount > 2) {
          break;
        }
        for (String newPhone : replaces.get(p)) {
          if (results == null) {
            results = Lists.newArrayList();
          }
          results.add(input.withReplacedPhoneme(i, newPhone));
        }
      }
    }
    if (results == null) return ImmutableList.of();
    return results;
  }
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy