Many resources are needed to download a project. Please understand that we have to cover our server costs. Thank you in advance. The project price is only $1.
You can buy this project and download or modify it as often as you want.
package edu.stanford.nlp.parser.lexparser;
import java.util.Map;
import java.util.Set;
import edu.stanford.nlp.io.EncodingPrintWriter;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.ling.WordTag;
import edu.stanford.nlp.ling.Tag;
import edu.stanford.nlp.ling.Label;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.Index;
/**
* Stores, trains, and scores with an unknown word model. A couple
* of filters deterministically force rewrites for certain proper
* nouns, dates, and cardinal and ordinal numbers; when none of these
* filters applies, either the distribution of terminals with the same
* first character is used, or Good-Turing smoothing is used. Although
* this model was developed for Chinese, the training and storage methods
* could be used cross-linguistically.
*
* @author Roger Levy
*/
public class ChineseUnknownWordModel extends BaseUnknownWordModel {
// Name of a Chinese character encoding (GB18030); not part of the model itself.
private static final String encoding = "GB18030"; // used only for debugging
// Fixed at construction time (final). NOTE(review): presumably switches the
// model to Unicode-character-type-based handling of unknown words; the code
// that reads this flag is not visible here -- confirm.
private final boolean useUnicodeType;
/* The regular expressions below are stored as ASCII Unicode escapes. To
* edit them, either write the Unicode code points directly, or use
* native2ascii (or a similar program) to convert the file into a Chinese
* encoding, edit it, and then convert it back. */
// Matches any string containing at least one numeral character: an ASCII
// digit (0-9), a fullwidth digit (U+FF10 to U+FF19), one of the Chinese
// numerals for one through ten, hundred, thousand, ten thousand and hundred
// million, the character for zero (U+96F6), or one of the circle-shaped zero
// variants (U+3007, U+25CB, U+25EF).
private static final String numberMatch = ".*[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u767e\u5343\u4e07\u4ebf\u96F6\u3007\u25cb\u25ef].*";
// numberMatch followed by exactly one date-unit character: year (U+5E74),
// month (U+6708), day (U+65E5), or day-of-month/number (U+53F7). As a
// whole-string pattern this accepts strings that contain a numeral and end
// in one of those four characters.
private static final String dateMatch = numberMatch + "[\u5e74\u6708\u65e5\u53f7]";
// Matches any string that begins with the ordinal prefix character U+7B2C.
private static final String ordinalMatch = "\u7b2c.*";
// Uses mid-dot characters as one clue that a word is a proper name: matches
// any string containing U+00B7 (middle dot), U+0387 (Greek ano teleia),
// U+2022 (bullet), U+2024 (one dot leader), U+2027 (hyphenation point),
// U+2219 (bullet operator), U+22C5 (dot operator), or U+30FB (katakana
// middle dot).
private static final String properNameMatch = ".*[\u00b7\u0387\u2022\u2024\u2027\u2219\u22C5\u30FB].*";
// Fixed at construction time (final). NOTE(review): presumably holds the
// first characters of words seen in training, as the class comment mentions
// a distribution over terminals sharing a first character; the element type
// is not visible here (raw Set) -- confirm against the constructor and callers.
private final Set seenFirst;
public ChineseUnknownWordModel(Options op, Lexicon lex,
Index wordIndex,
Index tagIndex,
ClassicCounter unSeenCounter,
Map