org.datacleaner.beans.transform.SynonymLookupTransformer Maven / Gradle / Ivy
/**
* DataCleaner (community edition)
* Copyright (C) 2014 Neopost - Customer Information Management
*
* This copyrighted material is made available to anyone wishing to use, modify,
* copy, or redistribute it subject to the terms and conditions of the GNU
* Lesser General Public License, as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this distribution; if not, write to:
* Free Software Foundation, Inc.
* 51 Franklin Street, Fifth Floor
* Boston, MA 02110-1301 USA
*/
package org.datacleaner.beans.transform;
import java.util.List;
import javax.inject.Inject;
import javax.inject.Named;
import org.apache.metamodel.util.HasName;
import org.datacleaner.api.Alias;
import org.datacleaner.api.Categorized;
import org.datacleaner.api.Close;
import org.datacleaner.api.Configured;
import org.datacleaner.api.Description;
import org.datacleaner.api.ExternalDocumentation;
import org.datacleaner.api.ExternalDocumentation.DocumentationLink;
import org.datacleaner.api.ExternalDocumentation.DocumentationType;
import org.datacleaner.api.HasLabelAdvice;
import org.datacleaner.api.Initialize;
import org.datacleaner.api.InputColumn;
import org.datacleaner.api.InputRow;
import org.datacleaner.api.OutputColumns;
import org.datacleaner.api.Provided;
import org.datacleaner.api.Transformer;
import org.datacleaner.components.categories.ImproveSuperCategory;
import org.datacleaner.components.categories.ReferenceDataCategory;
import org.datacleaner.configuration.DataCleanerConfiguration;
import org.datacleaner.reference.SynonymCatalog;
import org.datacleaner.reference.SynonymCatalogConnection;
import com.google.common.base.Joiner;
/**
* A simple transformer that uses a synonym catalog to replace a synonym with
* it's master term.
*/
@Named("Synonym lookup")
@Alias("Synonym replacement")
@Description("Replaces strings with their synonyms")
@ExternalDocumentation({ @DocumentationLink(title = "Segmenting customers on messy data",
url = "https://www.youtube.com/watch?v=iy-j5s-uHz4", type = DocumentationType.VIDEO, version = "4.0"),
@DocumentationLink(title = "Understanding and using Synonyms",
url = "https://www.youtube.com/watch?v=_YiPaA8bFt4", type = DocumentationType.VIDEO, version = "2.0") })
@Categorized(superCategory = ImproveSuperCategory.class, value = ReferenceDataCategory.class)
public class SynonymLookupTransformer implements Transformer, HasLabelAdvice {
public enum ReplacedSynonymsType implements HasName {
STRING("String"), LIST("List");
private final String _name;
ReplacedSynonymsType(final String name) {
_name = name;
}
@Override
public String getName() {
return _name;
}
}
@Configured
InputColumn column;
@Configured
SynonymCatalog synonymCatalog;
@Configured
@Description("Retain original value when no synonyms are found. If turned off, "
+ " will be returned when no synonyms are found.")
boolean retainOriginalValue = true;
@Configured
@Alias("Look up every token")
@Description("Replace synonyms that occur as a substring within the complete text? If turned off, "
+ "only synonyms that match the complete text value will be replaced.")
boolean replaceInlinedSynonyms = true;
@Inject
@Configured
@Description("How should the synonyms and the master terms that replaced them be returned?"
+ " As a concatenated String or as a List.")
ReplacedSynonymsType replacedSynonymsType = ReplacedSynonymsType.STRING;
@Provided
DataCleanerConfiguration configuration;
private SynonymCatalogConnection synonymCatalogConnection;
public SynonymLookupTransformer() {
}
public SynonymLookupTransformer(final InputColumn column, final SynonymCatalog synonymCatalog,
final boolean retainOriginalValue, final DataCleanerConfiguration configuration) {
this();
this.column = column;
this.synonymCatalog = synonymCatalog;
this.retainOriginalValue = retainOriginalValue;
this.configuration = configuration;
}
@Override
public OutputColumns getOutputColumns() {
final Class>[] columnTypes;
if (replacedSynonymsType == ReplacedSynonymsType.STRING) {
columnTypes = new Class[] { String.class, String.class, String.class };
} else {
columnTypes = new Class[] { String.class, List.class, List.class };
}
return new OutputColumns(
new String[] { column.getName() + " (synonyms replaced)", column.getName() + " (synonyms found)",
column.getName() + " (master terms found)" }, columnTypes);
}
@Override
public String getSuggestedLabel() {
if (synonymCatalog == null) {
return null;
}
return "Lookup: " + synonymCatalog.getName();
}
@Initialize
public void init() {
synonymCatalogConnection = synonymCatalog.openConnection(configuration);
}
@Close
public void close() {
if (synonymCatalogConnection != null) {
synonymCatalogConnection.close();
synonymCatalogConnection = null;
}
}
@Override
public Object[] transform(final InputRow inputRow) {
final String originalValue = inputRow.getValue(column);
if (originalValue == null) {
return new String[3];
}
if (replaceInlinedSynonyms) {
final SynonymCatalogConnection.Replacement replacement =
synonymCatalogConnection.replaceInline(originalValue);
if (replacedSynonymsType == ReplacedSynonymsType.STRING) {
return new Object[] { replacement.getReplacedString(), Joiner.on(' ').join(replacement.getSynonyms()),
Joiner.on(' ').join(replacement.getMasterTerms()) };
} else {
return new Object[] { replacement.getReplacedString(), replacement.getSynonyms(),
replacement.getMasterTerms() };
}
} else {
final String masterTerm = synonymCatalogConnection.getMasterTerm(originalValue);
final Object lookupResult = masterTerm != null ? masterTerm : (retainOriginalValue ? originalValue : null);
final Object synonym = masterTerm != null ? originalValue : null;
return new Object[] { lookupResult, synonym, masterTerm };
}
}
}
© 2015 - 2024 Weber Informatics LLC | Privacy Policy