// io.bdrc.lucene.sa.CmdParser Maven / Gradle / Ivy
// Show all versions of lucene-sa Show documentation
/*******************************************************************************
* Copyright (c) 2017 Buddhist Digital Resource Center (BDRC)
*
* If this file is a derivation of another work the license header will appear
* below; otherwise, this work is licensed under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with the
* License.
*
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
package io.bdrc.lucene.sa;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.TreeMap;
import java.util.TreeSet;
/**
* Parses cmds from the total Trie {@code total_output.txt} and reconstructs
* the sandhied substring found between the two words undergoing sandhi.
*
* Is used by {@link SkrtWordTokenizer#reconstructLemmas}
*
* @author Hélios Hildt
*
*/
public class CmdParser {
// Instance state of the parser. Apart from the scratch array t, every field is
// initialised to an "unset" value (null for references, -1 for the two ints).
// NOTE(review): what each field holds is not visible in this part of the file; the
// grouping comments below are the original author's — confirm details against the parsing method.
private String[] t = new String[2]; // temporary variable constantly reused
private int sandhiType = -1;
private Integer pos = null;
private String entry = null;
private String[] initials = null;
private String diffInitial = null; // there can only be one initial diff
private String[] diffFinals = null;
// for sandhied (HashMap key)
private Integer toDelete = null;
private String initialCharsSandhied = null;
// for unsandhied (HashMap value)
private String initialCharsOriginal = null;
private String toAdd = null;
// NOTE(review): the generic type arguments of this declaration appear to have been
// stripped (only a stray '>' remains), so the line does not compile as written.
// Restore them from the original file — TODO confirm the exact key/value types.
private TreeMap> sandhis = null;
private int idempotentGroup = -1;
/**
 * Lookup table from a group number (1 to 9) to a list of one-character strings.
 * The comment above each entry is the original author's name for that group.
 *
 * Groups 1 and 5 hold the same characters, as do groups 3 and 7; they are kept as
 * separate entries so that each group number can be looked up directly.
 *
 * NOTE(review): judging by the field name, each list presumably holds the initials
 * that are left unchanged by the sandhi of that group — confirm against the code
 * that reads this map.
 *
 * The map is filled once by the static initializer that follows and is not meant
 * to be modified afterwards.
 */
private static final HashMap<Integer, List<String>> idempotentInitials = new HashMap<Integer, List<String>>();

// Populates idempotentInitials. A plain static initializer is used instead of
// "double-brace" initialization, which would create an anonymous HashMap subclass
// (and require a serialVersionUID) just to fill the map.
static {
    // vowels
    idempotentInitials.put(1, Arrays.asList("F", "x", "X", "k", "K", "g", "G", "N", "c", "C", "j", "J", "Y",
            "w", "W", "q", "L", "Q", "|", "R", "t", "T", "d", "D", "n", "p", "P", "b", "B", "m",
            "y", "r", "l", "v", "S", "z", "s", "h", "H", "Z", "V"));
    // consonants1
    idempotentInitials.put(2, Arrays.asList("a", "A", "i", "I", "u", "U", "f", "F", "x", "X", "e", "E", "o", "O",
            "k", "K", "g", "G", "N", "c", "C", "j", "J", "Y", "w", "W", "q", "L", "Q", "|", "R",
            "t", "T", "d", "D", "n", "p", "P", "b", "B", "m", "H", "Z", "V"));
    // consonants2
    idempotentInitials.put(3, Arrays.asList("a", "A", "i", "I", "u", "U", "f", "F", "x", "X", "e", "E", "o", "O",
            "N", "Y", "L", "|", "R", "y", "r", "l", "v", "S", "z", "s", "h", "H", "Z", "V"));
    // cC_words
    idempotentInitials.put(4, Arrays.asList("a", "A", "i", "I", "u", "U", "f", "F", "x", "X", "e", "E", "o", "O",
            "k", "K", "g", "G", "N", "c", "j", "J", "Y", "w", "W", "q", "L", "Q", "|", "R",
            "t", "T", "d", "D", "n", "p", "P", "b", "B", "m", "y", "r", "l", "v", "S", "z", "s",
            "h", "H", "Z", "V"));
    // consonants1_vowels
    idempotentInitials.put(5, Arrays.asList("F", "x", "X", "k", "K", "g", "G", "N", "c", "C", "j", "J", "Y",
            "w", "W", "q", "L", "Q", "|", "R", "t", "T", "d", "D", "n", "p", "P", "b", "B", "m",
            "y", "r", "l", "v", "S", "z", "s", "h", "H", "Z", "V"));
    // visarga1
    idempotentInitials.put(6, Arrays.asList("F", "x", "X", "k", "K", "g", "G", "N", "c", "C", "j", "J", "Y",
            "w", "W", "q", "L", "Q", "|", "R", "t", "T", "d", "D", "n", "p", "P", "b", "B", "m",
            "H", "Z", "V"));
    // visarga2
    idempotentInitials.put(7, Arrays.asList("a", "A", "i", "I", "u", "U", "f", "F", "x", "X", "e", "E", "o", "O",
            "N", "Y", "L", "|", "R", "y", "r", "l", "v", "S", "z", "s", "h", "H", "Z", "V"));
    // punar
    idempotentInitials.put(8, Arrays.asList("I", "F", "x", "X", "L", "|", "H", "Z", "V"));
    // all SLP
    idempotentInitials.put(9, Arrays.asList("a", "A", "i", "I", "u", "U", "f", "F", "x", "X", "e", "E", "o", "O",
            "k", "K", "g", "G", "N", "c", "C", "j", "J", "Y", "w", "W", "q", "L", "Q", "|", "R",
            "t", "T", "d", "D", "n", "p", "P", "b", "B", "m", "y", "r", "l", "v", "S", "z", "s",
            "h", "H", "Z", "V", "M"));
}
/**
* note: currently, parsing cmd is not done using indexes. this method might be slow.
*
* This is how cmd is structured, with the names used in this method: (correct formatting in the code file)
*
*
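* NOTE(review): the formatted description of the cmd structure that followed here
* (inside a {@code ...} block) is missing from this copy of the file and the tag was
* left unterminated; restore the description from the original source.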
* @param inflected the inflected form (a substring of the input string)
* @param cmd to be parsed. contains the info for reconstructing lemmas
* @return the parsed structure
*/
public TreeMap