com.force.i18n.grammar.parser.GrammaticalLabelFileParser Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of grammaticus Show documentation
Show all versions of grammaticus Show documentation
Localization Framework that allows grammatically correct renaming of nouns
/*
* Copyright (c) 2017, salesforce.com, inc.
* All rights reserved.
* Licensed under the BSD 3-Clause license.
* For full license text, see LICENSE.txt file in the repo root or https://opensource.org/licenses/BSD-3-Clause
*/
package com.force.i18n.grammar.parser;
import static com.force.i18n.commons.util.settings.IniFileUtil.intern;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.text.MessageFormat;
import java.util.*;
import java.util.concurrent.ConcurrentHashMap;
import java.util.logging.Logger;
import javax.xml.parsers.*;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import com.force.i18n.*;
import com.force.i18n.commons.text.GenericUniquefy;
import com.force.i18n.commons.text.Uniquefy;
import com.force.i18n.grammar.*;
import com.force.i18n.settings.*;
import com.force.i18n.settings.BasePropertyFile.MetaDataInfo;
import com.google.common.base.Splitter;
import com.google.common.collect.*;
/**
* Holder of information while parsing a set of Label files.
*
* @author stamm
*/
public class GrammaticalLabelFileParser implements BasePropertyFile.Parser {
/** Logger used for non-fatal diagnostics such as bad aliases. */
private static final Logger logger = Logger.getLogger(GrammaticalLabelFileParser.class.getName());
/** Dictionary supplied at construction; exposed through {@link #getDictionary()}. */
private final LanguageDictionary dictionary;
/** Descriptor naming the label files this parser reads. */
private final LabelSetDescriptor desc;
/** Provider of the parent label set consulted when resolving aliases; may be null. */
private final GrammaticalLabelSetProvider parentProvider;
/** When true, {@link #trackLabel(String, String)} records labels for the duplicate report. */
private final boolean trackDupes;
/** Aliases still waiting to be resolved, keyed by their source "section.param" key. */
private final Map<String, AliasParam> aliasMap = new HashMap<>();
/** Label text mapped to every location it was reported from; only filled when trackDupes is set. */
private final Multimap<String, String> allLabels = TreeMultimap.create();
private Uniquefy uniquefy = new Uniquefy();
/** Shares equal LabelRef instances between aliases; set to null once aliases are resolved. */
private GenericUniquefy<LabelRef> aliasUniquefy = new GenericUniquefy<>();
/** Aliases that reported an error; created lazily, so null until the first error. */
private List<AliasParam> illegalAliases;
/** The set of labels that have a "problem" with them; created lazily, so null until the first one. */
private Set<ErrorInfo> invalidLabels;
/** Newest last-modified stamp among the URLs parsed so far, or -1 if none has been parsed. */
private long lastModified = -1;
// NOTE(review): String-to-String is inferred from the name; nothing in this file reads or writes the map.
private Map<String, String> sectionToFileName = new ConcurrentHashMap<>();
/**
 * Construct a label file parser whose duplicate tracking is decided automatically: it is switched on
 * only when the dictionary's language is the base language and {@link #isDupeLabelTrackingEnabled()}
 * returns true.
 *
 * @param dictionary the dictionary to fill in with nouns and adjectives
 * @param labelDesc the descriptor of the location of the label set
 * @param parentProvider the parent labelset if this is overriding the labels and is a child labelset
 * @throws IOException if there is an exception while parsing
 */
public GrammaticalLabelFileParser(LanguageDictionary dictionary, LabelSetDescriptor labelDesc, GrammaticalLabelSetProvider parentProvider) throws IOException {
    this(dictionary, labelDesc, parentProvider, tracksDupesByDefault(dictionary));
}

/** Computes the default duplicate-tracking flag for the three-argument constructor. */
private static boolean tracksDupesByDefault(LanguageDictionary dictionary) {
    if (dictionary.getLanguage() != LanguageProviderFactory.get().getBaseLanguage()) {
        return false;
    }
    return isDupeLabelTrackingEnabled();
}
/**
 * Construct a label file parser with an explicit duplicate-tracking choice. Nothing is parsed here;
 * the arguments are only stored.
 *
 * @param dictionary the dictionary associated with the labels being parsed
 * @param labelDesc the descriptor of the location of the label set
 * @param parentProvider the parent labelset provider, or null when there is no parent
 * @param trackDupes whether label text should be recorded for the duplicate report
 */
public GrammaticalLabelFileParser(LanguageDictionary dictionary, LabelSetDescriptor labelDesc, GrammaticalLabelSetProvider parentProvider, boolean trackDupes) {
    this.trackDupes = trackDupes;
    this.parentProvider = parentProvider;
    this.desc = labelDesc;
    this.dictionary = dictionary;
}
@Override
public void load(PropertyFileData data, Map> metaData) throws IOException {
// be nice if we removed the special condition on train.xml in LabelHandler.java
boolean found = false;
if (this.desc instanceof TestLanguageLabelSetDescriptor) {
parseLabels(data, ((TestLanguageLabelSetDescriptor)this.desc).getText());
found = true;
} else if (this.desc.hasOverridingFiles()) {
// Parse only non-english labels
for (URL labelFile : this.desc.getOverridingFiles()) {
if (TrackingHandler.exists(labelFile)) {
found = true;
parseLabels(data, labelFile);
}
}
} else {
if(this.desc.hasModularizedFiles()) {
for(URL modularizedFile : this.desc.getModularizedFiles()) {
found = true;
parseLabels(data, modularizedFile);
}
} else if (TrackingHandler.exists(this.desc.getRootFile())) {
// Parse english labels
found = true;
parseLabels(data, this.desc.getRootFile());
}
}
if (!found) {
// TODO: Vietnamese doesn't exist yet (new languages don't), so we pretty much just ignore it
//throw new java.io.FileNotFoundException("can't read label file: " + this.desc.toString());
}
}
/**
 * Part of the Parser interface.
 *
 * @return the newest last-modified stamp seen among the parsed URLs, or -1 if none was parsed
 */
@Override
public long getFileLastModified() {
    return lastModified;
}
/**
 * @return the root directory reported by the label set descriptor
 */
public URL getRootDir() {
    return desc.getRootDir();
}
/**
 * @return the dictionary this parser was constructed with
 */
public LanguageDictionary getDictionary() {
    return dictionary;
}
/**
 * @return the newest last-modified stamp seen among the parsed URLs, or -1 if none was parsed;
 *         the same value as {@link #getFileLastModified()}
 */
public long getLastModified() {
    return lastModified;
}
/**
 * Returns the live (not copied) section-to-file-name map held by this parser.
 * NOTE(review): the String-to-String typing is inferred from the name; confirm against callers.
 *
 * @return the mutable map of section name to file name
 */
public Map<String, String> getSectionToFileName() {
    return this.sectionToFileName;
}
/**
 * Parses one label file into {@code data} using a fresh label handler bound to that file.
 *
 * @param data the property data the handler fills in
 * @param file the label file to read
 */
protected void parseLabels(PropertyFileData data, URL file) {
    parse(file, new GrammaticalLabelFileHandler(file, data, this));
}
/**
 * Parses label XML supplied as a string into {@code data}. The handler is bound to the
 * descriptor's root file, since the text itself has no location.
 *
 * @param data the property data the handler fills in
 * @param labelText the XML text to parse
 */
protected void parseLabels(PropertyFileData data, String labelText) {
    GrammaticalLabelFileHandler labelHandler = new GrammaticalLabelFileHandler(this.desc.getRootFile(), data, this);
    Reader textReader = new StringReader(labelText);
    parse(new InputSource(textReader), labelHandler);
}
/**
 * Creates a namespace-aware SAX parser with external DTD loading and external general/parameter
 * entities switched off.
 *
 * @param doValidation currently has no practical effect: the only thing it triggers is
 *        {@code setValidating(false)}, so validation is never switched on
 * @return a newly created parser
 * @throws ParserConfigurationException if the factory cannot honour the requested configuration
 * @throws SAXException if the underlying parser reports an error while being configured
 */
protected SAXParser getSAXParser(boolean doValidation) throws ParserConfigurationException, SAXException {
    final String[] disabledFeatures = {
        "http://apache.org/xml/features/nonvalidating/load-external-dtd",
        "http://xml.org/sax/features/external-general-entities",
        "http://xml.org/sax/features/external-parameter-entities",
    };
    SAXParserFactory factory = SAXParserFactory.newInstance();
    for (String feature : disabledFeatures) {
        factory.setFeature(feature, false);
    }
    factory.setNamespaceAware(true);
    if (doValidation) {
        factory.setValidating(false);
    }
    return factory.newSAXParser();
}
/**
 * Parses the XML at {@code file} with the given handler and raises {@code lastModified} to the
 * URL's last-modified stamp if that is newer.
 *
 * @param file the URL to read
 * @param handler the SAX handler receiving the parse events
 * @throws RuntimeException wrapping any parser-configuration, SAX or I/O failure; the message
 *         carries the handler's line-number string
 */
protected void parse(URL file, TrackingHandler handler) {
    try {
        SAXParser saxParser = getSAXParser(false);
        URLConnection connection = file.openConnection();
        connection.connect();
        this.lastModified = Math.max(this.lastModified, connection.getLastModified());
        // try-with-resources: release the stream even when parsing fails part-way through
        try (InputStream in = new BufferedInputStream(connection.getInputStream())) {
            saxParser.parse(in, handler);
        }
    }
    catch (ParserConfigurationException | SAXException | IOException ex) {
        throw new RuntimeException("Error parsing XML file " + handler.getLineNumberString(), ex);
    }
}
/**
 * Parses an in-memory XML source with the given handler, which is also installed as the
 * reader's entity resolver. Does not touch {@code lastModified}.
 *
 * @param source the XML input
 * @param handler the SAX handler receiving the parse events
 * @throws RuntimeException wrapping any parser-configuration, SAX or I/O failure; the message
 *         carries the handler's line-number string
 */
protected void parse(InputSource source, TrackingHandler handler) {
    try {
        SAXParser parser = getSAXParser(true);
        parser.getXMLReader().setEntityResolver(handler);
        parser.parse(source, handler);
    } catch (ParserConfigurationException | SAXException | IOException failure) {
        throw new RuntimeException("Error parsing XML file " + handler.getLineNumberString(), failure);
    }
}
/**
 * @return whether this parser records labels for the duplicate report
 */
boolean trackDupes() {
    return trackDupes;
}
/**
 * Global switch consulted by the three-argument constructor when deciding whether to track
 * duplicate labels. Hard-wired to {@code false}.
 *
 * @return always {@code false}
 */
static boolean isDupeLabelTrackingEnabled() {
return false;
}
/**
 * Passes the label through this parser's {@code Uniquefy} instance.
 *
 * @param label the label text
 * @return whatever {@code Uniquefy.unique} returns for the label
 */
String uniquefy(String label) {
    return this.uniquefy.unique(label);
}
/**
 * Records that {@code label} was seen at {@code location}, for the duplicate report.
 * Does nothing unless duplicate tracking is enabled.
 *
 * @param label the label text
 * @param location where the label was found
 */
void trackLabel(String label, String location) {
    if (!this.trackDupes) {
        return;
    }
    allLabels.put(label, location);
}
List writeDuplicateLabelsFile(Writer out) throws IOException {
List result = new ArrayList<>();
try {
long totalWords = 0;
long dupeWords = 0;
long totalLabels = 0;
long dupeLabels = 0;
for (Map.Entry> entry : this.allLabels.asMap().entrySet()) {
int count = entry.getValue().size();
totalLabels += count;
dupeLabels += (count - 1);
int words = Iterables.size(Splitter.on(' ').split(entry.getKey()));
totalWords += words * count;
dupeWords += words * (count - 1);
if (count > 1) {
String msg = entry.getKey() + ": " + entry.getValue().size() + " dupes: " + entry.getValue();
result.add(msg);
if (out != null) {
out.write(msg);
out.write('\n');
}
}
}
if (out != null) {
out.write("\n\n************ dupe labels: " + dupeLabels + "\n");
out.write("************ total labels: " + totalLabels + "\n");
out.write("************ dupe words: " + dupeWords + "\n");
out.write("************ total words: " + totalWords + "\n");
}
this.allLabels.clear();
return result;
} finally {
if (out != null) out.close();
}
}
/**
 * @return the aliases that reported an error, or null if no alias error has been recorded
 *         (the list is created lazily)
 */
List<AliasParam> getIllegalAliases() { return this.illegalAliases; }
/**
 * @return the errors recorded through {@code addInvalidLabel}, or null if none has been recorded
 *         (the set is created lazily)
 */
public Set<ErrorInfo> getInvalidLabels() { return this.invalidLabels; }
// ====================================================================
// Param alias handler:
// ====================================================================
/** Prefix put in front of every alias error message built by {@code AliasParam.error}. */
private static final String BAD_ALIAS = "###\tBad alias: ";
/**
 * A pending alias: the label at {@code srcSection.srcParam} takes its value from the label at
 * {@code dstSection.dstParam}. It also remembers the file and line where it was declared so
 * problems can be reported.
 * <p>
 * Kept as an inner (non-static) class because error reporting and label-ref creation use the
 * enclosing parser's state. Equality and hashing are based on file and line number only.
 */
class AliasParam implements Comparable<AliasParam> {
    final URL file;
    final int lineNumber;
    final String srcSection, srcParam, dstSection, dstParam;
    final String srcKey;  // Memoize since it's used so often
    final String dstKey;  // Memoize since it's used so often
    boolean ignore = false;

    /**
     * @param srcSection section of the aliasing label (stored as given)
     * @param srcParam name of the aliasing label (stored as given)
     * @param dstSection section of the target label (stored trimmed)
     * @param dstParam name of the target label (stored trimmed)
     * @param file file the alias was declared in; may be null
     * @param lineNumber line the alias was declared on
     */
    AliasParam(String srcSection, String srcParam, String dstSection, String dstParam, URL file, int lineNumber) {
        this.file = file;
        this.lineNumber = lineNumber;
        this.srcSection = srcSection;
        this.srcParam = srcParam;
        this.dstSection = dstSection.trim();
        this.dstParam = dstParam.trim();
        this.srcKey = GrammaticalLabelFileParser.getKey(this.srcSection, this.srcParam);
        this.dstKey = GrammaticalLabelFileParser.getKey(this.dstSection, this.dstParam);
        // ignore self-reference. Compare against the trimmed destination fields: those are what
        // resolution uses, so padding in the raw arguments must not hide a self-reference.
        this.ignore = this.srcSection.equals(this.dstSection) && this.srcParam.equals(this.dstParam);
        if (this.ignore) {
            error("Circular reference at ", getKey());
        }
    }

    /** @return the memoized "section.param" key of the aliasing label */
    String getKey() {
        return this.srcKey;
    }

    /** @return the memoized "section.param" key of the target label */
    String getTargetKey() {
        return this.dstKey;
    }

    /**
     * Reports a problem with this alias. When debugging the base language (and the descriptor is
     * not a test-language descriptor) this throws; otherwise the alias is added to the parser's
     * illegal-alias list and the message is logged at FINE level.
     *
     * @param msg the message text
     * @param key a key appended to the message, or null for none
     * @throws IllegalStateException in the debugging case described above
     */
    void error(String msg, String key) {
        String message = BAD_ALIAS + location() + msg + (key == null ? "" : key);
        // if we are loading English labels in dev-mode throw an exception here
        if (I18nJavaUtil.isDebugging() && LanguageProviderFactory.get().getBaseLanguage() == GrammaticalLabelFileParser.this.getDictionary().getLanguage()
                && !(GrammaticalLabelFileParser.this.desc instanceof TestLanguageLabelSetDescriptor)) {
            throw new IllegalStateException(message);
        } else {
            if (illegalAliases == null) illegalAliases = new ArrayList<>(10);
            illegalAliases.add(this);
            // Oh, just keep going
            logger.fine(message);
        }
    }

    /**
     * Checks that the target label exists in {@code label}.
     *
     * @return a reference to the target when it exists, or when its whole section is missing
     *         (reported, and left to fail later); null when the section exists but the param
     *         does not (also reported)
     */
    private Object validateDst(LabelSet label) {
        Map<?, ?> sectionMap = label.getSection(dstSection);
        if (sectionMap == null) {
            error("Section " + dstSection + " does not exist.", null);
            return makeLabelRef(dstSection, dstParam);  // Let it die later
        }
        Object val = sectionMap.get(dstParam);
        if (val == null) {
            error("Param does not exist: ", getTargetKey());
        }
        return val != null ? makeLabelRef(dstSection, dstParam) : null;
    }

    /** Builds a label reference, passed through the parser's alias uniquefier. */
    LabelRef makeLabelRef(String section, String param) {
        // use intern(String) for section/param name as these are always reused
        return aliasUniquefy.unique(new LabelRef(intern(section), intern(param)));
    }

    /** @return "path(line): " for a known file, or the empty string when the file is null */
    private String location() {
        return file == null ? "" : file.getPath() + "(" + lineNumber + "): ";
    }

    /** @return the file's path, or the empty string when the file is null */
    private String path() {
        return file == null ? "" : file.getPath();
    }

    @Override
    public String toString() {
        // null-safe, matching error(): a missing file simply omits the location prefix
        return location() + srcKey + "->" + dstKey;
    }

    @Override
    public int hashCode() {
        return Objects.hash(file, lineNumber);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj) return true;
        if (obj == null || getClass() != obj.getClass()) return false;
        AliasParam other = (AliasParam)obj;
        return Objects.equals(file, other.file) && lineNumber == other.lineNumber;
    }

    /** Orders by file path, then by line number. */
    @Override
    public int compareTo(AliasParam o) {
        int fileCompare = path().compareTo(o.path());
        return fileCompare != 0 ? fileCompare : Integer.compare(this.lineNumber, o.lineNumber);
    }
}
/**
 * Builds the "section.param" key used to index aliases.
 *
 * @param sec the section name
 * @param param the parameter name
 * @return the two names joined with a single dot
 */
static final String getKey(String sec, String param) {
    StringBuilder key = new StringBuilder();
    key.append(sec).append(".").append(param);
    return key.toString();
}
/**
 * Records a problem with a label so it can later be retrieved through {@link #getInvalidLabels()}.
 * The backing sorted set is created on first use.
 *
 * @param type the kind of problem
 * @param section the label's section
 * @param key the label's key
 * @param file the file the label came from
 * @param lineNumber the line the label was declared on
 * @param args arguments for the error type's message
 * @return the error record that was added
 */
ErrorInfo addInvalidLabel(ErrorType type, String section, String key, URL file, int lineNumber, Object... args) {
    ErrorInfo error = new ErrorInfo(type, section, key, file, lineNumber, args);
    if (this.invalidLabels == null) this.invalidLabels = new TreeSet<>();
    this.invalidLabels.add(error);
    return error;
}
/**
 * Registers an alias from {@code srcSection.srcParam} to {@code dstSection.dstParam}, replacing
 * any alias already registered under the same source key.
 *
 * @param srcSection section of the aliasing label
 * @param srcParam name of the aliasing label
 * @param dstSection section of the target label
 * @param dstParam name of the target label
 * @param file file the alias was declared in
 * @param lineNumber line the alias was declared on
 */
void addAlias(String srcSection, String srcParam, String dstSection, String dstParam, URL file, int lineNumber) {
    AliasParam pending = new AliasParam(srcSection, srcParam, dstSection, dstParam, file, lineNumber);
    aliasMap.put(pending.getKey(), pending);
}
/**
 * Drops the pending alias registered for {@code srcSection.srcParam}, if there is one.
 *
 * @param srcSection section of the aliasing label
 * @param srcParam name of the aliasing label
 */
void removeAlias(String srcSection, String srcParam) {
    assert this.aliasMap != null;
    // remove() is already a no-op for an absent key
    this.aliasMap.remove(getKey(srcSection, srcParam));
}
/**
 * @return a read-only view of the aliases still pending, keyed by source "section.param" key
 */
Map<String, AliasParam> getAliasMap() {
    return Collections.unmodifiableMap(this.aliasMap);
}
/**
 * Finishes parsing by resolving every pending alias into {@code writeSet}. When a parent provider
 * supplies a label set for the dictionary's language, alias targets are looked up through a
 * fallback set built from {@code writeSet} and that parent set; otherwise in {@code writeSet} alone.
 *
 * @param writeSet the set you are writing to
 */
public void close(GrammaticalLabelSetImpl writeSet) {
    GrammaticalLabelSet parentSet = null;
    if (parentProvider != null) {
        parentSet = parentProvider.getSet(dictionary.getLanguage());
    }
    GrammaticalLabelSet lookupSet = writeSet;
    if (parentSet != null) {
        lookupSet = new GrammaticalLabelSetFallbackImpl(writeSet, parentSet);
    }
    resolveAliases(lookupSet, writeSet);
}
/**
 * Resolve all alias values declared in {@code <param>} tags. Each such param is expected to exist
 * in {@code labelSet} with a null value.
 * <p>
 * After this call the alias uniquefier is released, so no further label references can be made.
 *
 * @param labelSet
 *            LabelSet to resolve alias values
 * @param writeSet the label set to write to
 */
void resolveAliases(GrammaticalLabelSet labelSet, GrammaticalLabelSetImpl writeSet) {
    if (this.aliasMap.isEmpty())
        return;

    // Iterate over a snapshot instead of the live map: resolveAlias removes resolved entries from
    // aliasMap, and a recursive resolution may remove several at once. The snapshot is a list so
    // that aliases which compare equal (same file and line) are each still visited.
    for (AliasParam ap : new ArrayList<AliasParam>(this.aliasMap.values())) {
        // skip aliases already resolved as part of an earlier chain
        if (this.aliasMap.containsKey(ap.getKey())) {
            resolveAlias(labelSet, writeSet, ap, null);
        }
    }
    assert this.aliasMap.isEmpty() : "Unresolved aliases: " + this.aliasMap;
    this.aliasUniquefy = null;
}
/**
 * Resolve a single alias value, following chains of aliases recursively. The resolved value (or
 * the empty string when there is none) is written to {@code writeSet} under the alias's source
 * section/param, and the alias is removed from the pending map.
 *
 * @param labelSet the label set in which alias targets are looked up
 * @param writeSet the label set to write to
 * @param ap
 *            alias to evaluate. must NOT be null
 * @param refSet
 *            null if this is top level of alias chain; otherwise the keys already visited further
 *            up the chain, used to detect circular references
 * @return resolved value or null otherwise
 */
private Object resolveAlias(GrammaticalLabelSet labelSet, GrammaticalLabelSetImpl writeSet, AliasParam ap, Set<String> refSet) {
    Object retValue = null;
    if (ap.ignore) {
        // self-referencing alias: already reported when it was created
        retValue = "";
    } else {
        // dst could be either valid value, or another alias
        AliasParam t = this.aliasMap.get(ap.getTargetKey());
        if (t == null) {
            // target is not an alias. Set dst to the current value
            retValue = ap.validateDst(labelSet);
        } else if (refSet != null && refSet.contains(t.getKey())) {
            // target key exists in the recursive alias chain, that means this is
            // circular reference.
            ap.error("Circular reference at ", t.getKey());
            addInvalidLabel(ErrorType.BadAlias, ap.srcSection, ap.srcParam, ap.file, ap.lineNumber, t.getKey());
        } else {
            // recursive reference. Keep tracking down
            Set<String> localRefSet = refSet;
            if (refSet == null) {
                // means this is the top level (or maybe middle) of alias chain
                localRefSet = new HashSet<>();
            } else {
                // recursively called from alias chain. add to the chain list
                refSet.add(ap.getKey());
            }

            retValue = resolveAlias(labelSet, writeSet, t, localRefSet);
            if (retValue == null)
                ap.error("Param does not exist: ", ap.getKey());

            // if the current stack level is the top of the chain, remove resolved values from the map.
            // localRefSet contains only descendants.
            if (refSet == null && !localRefSet.isEmpty()) {
                for (String visitedKey : localRefSet) {
                    this.aliasMap.remove(visitedKey);
                }
            }
        }
    }

    // an unresolved alias is written as the empty string rather than left null
    writeSet.put(ap.srcSection, ap.srcParam, (retValue == null ? "" : retValue));
    this.aliasMap.remove(ap.getKey());
    return retValue;
}
/**
 * Kinds of problem the parser can record against a label, each with a {@link MessageFormat}
 * pattern for its message.
 */
public enum ErrorType {
    // noun does not exist
    UnknownEntity("Unknown entity <{0}>"),
    // bad alias -- circular reference
    BadAlias("Bad alias: Circular reference \"{0}\""),
    // bad 'num' value in a plural tag
    BadPluralReference("Bad plural reference <{0}>"),
    // duplicate 'val' found in a when tag
    DuplicateWhen("Duplicate when <{0}>"),
    // TODO: this would never be returned for certain tags (the tag names are missing from the original note)
    BadDefault("You cannot specify {0} for a when and have default values"),
    // bad 'val' in a when tag
    BadCategory("Bad category <{0}>"),
    // other error -- may be unused
    Unknown("Unknown Error");

    private final String errorMessage;

    ErrorType(String msg) {
        this.errorMessage = msg;
    }

    /**
     * Builds the message for one recorded error.
     *
     * @param ref the error record supplying the message arguments and the location suffix;
     *        may be null, in which case the unformatted pattern is returned
     * @return the pattern formatted with the record's arguments (or the bare pattern when there
     *         are none), followed by " at " and the record's string form
     */
    public String getMessage(ErrorInfo ref) {
        // Check for null before touching ref; the original dereferenced it first.
        if (ref == null) {
            return errorMessage;
        }
        Object[] args = ref.getArguments();
        String ret = (args == null || args.length == 0) ? errorMessage
                : MessageFormat.format(errorMessage, args);
        return ret + " at " + ref.toString();
    }
}
/**
 * Container class for a parser error: a label reference (section, key, arguments) plus the kind
 * of error and the file and line it was found at.
 * @see GrammaticalLabelFileParser#getInvalidLabels()
 * @since 226
 */
public class ErrorInfo extends LabelRef {
    private static final long serialVersionUID = 1L;

    /** The kind of error. */
    public final ErrorType type;
    /** The file the offending label came from. */
    public final URL file;
    /** The line the offending label was declared on. */
    public final int lineNumber;

    ErrorInfo(ErrorType type, String section, String key, URL file, int lineNumber, Object...args) {
        super(section, key, args);
        this.lineNumber = lineNumber;
        this.file = file;
        this.type = type;
    }

    /** @return the message produced by this error's type for this record */
    public String getMessage() {
        return type.getMessage(this);
    }
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy