
com.jaeksoft.searchlib.classifier.Classifier Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface,
the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and
easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on
Windows and Linux/Unix/BSD.
/**
* License Agreement for OpenSearchServer
*
* Copyright (C) 2011-2013 Emmanuel Keller / Jaeksoft
*
* http://www.open-search-server.com
*
* This file is part of OpenSearchServer.
*
* OpenSearchServer is free software: you can redistribute it and/or
* modify it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* OpenSearchServer is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with OpenSearchServer.
* If not, see <http://www.gnu.org/licenses/>.
**/
package com.jaeksoft.searchlib.classifier;
import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.TreeSet;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.stream.StreamSource;
import javax.xml.xpath.XPathExpressionException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.memory.MemoryIndex;
import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.xml.sax.SAXException;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.function.expression.SyntaxError;
import com.jaeksoft.searchlib.index.FieldContent;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.query.ParseException;
import com.jaeksoft.searchlib.util.DomUtils;
import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDecimalFormat;
import com.jaeksoft.searchlib.util.ReadWriteLock;
import com.jaeksoft.searchlib.util.XPathParser;
import com.jaeksoft.searchlib.util.XmlWriter;
/**
 * A named classifier: a sorted set of {@link ClassifierItem}s that are scored
 * against a document in order to add a classification value (and optionally a
 * score) to that document.
 * <p>
 * Instances can be loaded from and written to XML. All mutable state is
 * guarded by a private read/write lock; accessors take the read lock and
 * mutators take the write lock.
 * <p>
 * Classifiers are ordered by name (see {@link #compareTo(Classifier)}).
 */
public class Classifier implements Comparable<Classifier> {

    // XML element and attribute names used by the File constructor and writeXml.
    private final static String CLASSIFIER_ITEM_ROOTNODE_NAME = "classifier";
    private final static String CLASSIFIER_ITEM_NODE_NAME = "classifierItem";
    private final static String CLASSIFIER_ITEM_ROOT_ATTR_NAME = "name";
    private final static String CLASSIFIER_ITEM_ROOT_ATTR_FIELD = "field";
    private final static String CLASSIFIER_ITEM_ROOT_ATTR_SCOREFIELD = "scoreField";
    private final static String CLASSIFIER_ITEM_ROOT_ATTR_ACTIVE = "active";
    private final static String CLASSIFIER_ITEM_ROOT_ATTR_METHOD = "method";
    private final static String CLASSIFIER_ITEM_DEFAULT_VALUE_NODE = "defaultValue";

    private final ReadWriteLock rwl = new ReadWriteLock();

    private String name;
    private String fieldName;
    private String scoreFieldName;
    private boolean active;
    private String defaultValue;

    /** The items, kept sorted by the natural ordering of ClassifierItem. */
    private TreeSet<ClassifierItem> valueSet;

    /**
     * Array view of {@link #valueSet}. A new array is created each time the
     * set changes; it stays {@code null} until the first rebuild.
     */
    private ClassifierItem[] valueSetArray;

    private ClassificationMethodEnum method;

    /**
     * Creates an empty, inactive classifier using the
     * {@link ClassificationMethodEnum#BESTSCORE} method.
     */
    public Classifier() {
        valueSetArray = null;
        valueSet = new TreeSet<ClassifierItem>();
        name = null;
        fieldName = null;
        scoreFieldName = null;
        active = false;
        defaultValue = null;
        method = ClassificationMethodEnum.BESTSCORE;
    }

    /**
     * Copy constructor.
     * 
     * @param source
     *            the classifier whose state is copied into the new instance
     */
    public Classifier(Classifier source) {
        this();
        source.copyTo(this);
    }

    /**
     * Copies the whole state of this classifier into {@code target}. Items are
     * duplicated through the ClassifierItem copy constructor, so the target
     * does not share item instances with this classifier.
     * <p>
     * The read lock of this instance is taken first, then the write lock of
     * the target. NOTE(review): two threads copying a-&gt;b and b-&gt;a at the
     * same time would acquire the locks in opposite order - confirm callers
     * never do that.
     * 
     * @param target
     *            the classifier that receives the copy
     */
    public void copyTo(Classifier target) {
        rwl.r.lock();
        try {
            target.rwl.w.lock();
            try {
                target.name = name;
                target.fieldName = fieldName;
                target.scoreFieldName = scoreFieldName;
                target.active = active;
                target.method = method;
                target.valueSetArray = null;
                target.defaultValue = defaultValue;
                target.valueSet.clear();
                for (ClassifierItem item : valueSet)
                    target.valueSet.add(new ClassifierItem(item));
                target.buildValueSetArray();
            } finally {
                target.rwl.w.unlock();
            }
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Loads a classifier from an XML file. If the file does not exist, or
     * contains no {@code classifier} root node, the instance keeps the values
     * set by the default constructor.
     * 
     * @param file
     *            the XML file to read
     * @throws ParserConfigurationException
     *             propagated from the XML reader
     * @throws SAXException
     *             propagated from the XML reader
     * @throws IOException
     *             propagated from the XML reader
     * @throws XPathExpressionException
     *             propagated from the node/attribute lookups
     * @throws SearchLibException
     *             declared for the project helpers used while loading
     */
    protected Classifier(File file) throws ParserConfigurationException,
            SAXException, IOException, XPathExpressionException,
            SearchLibException {
        this();
        if (!file.exists())
            return;
        Document document = DomUtils.readXml(new StreamSource(file), false);
        Node rootNode = DomUtils.getFirstNode(document,
                CLASSIFIER_ITEM_ROOTNODE_NAME);
        if (rootNode == null)
            return;
        setName(XPathParser.getAttributeString(rootNode,
                CLASSIFIER_ITEM_ROOT_ATTR_NAME));
        setFieldName(XPathParser.getAttributeString(rootNode,
                CLASSIFIER_ITEM_ROOT_ATTR_FIELD));
        setScoreFieldName(XPathParser.getAttributeString(rootNode,
                CLASSIFIER_ITEM_ROOT_ATTR_SCOREFIELD));
        // Only the literal "yes" (any case) activates the classifier.
        setActive("yes".equalsIgnoreCase(XPathParser.getAttributeString(
                rootNode, CLASSIFIER_ITEM_ROOT_ATTR_ACTIVE)));
        setMethod(ClassificationMethodEnum.find(XPathParser.getAttributeString(
                rootNode, CLASSIFIER_ITEM_ROOT_ATTR_METHOD)));
        Node defaultValueNode = DomUtils.getFirstNode(rootNode,
                CLASSIFIER_ITEM_DEFAULT_VALUE_NODE);
        if (defaultValueNode != null)
            setDefaultValue(defaultValueNode.getTextContent());
        List<Node> nodes = DomUtils.getNodes(rootNode,
                CLASSIFIER_ITEM_NODE_NAME);
        // Without item nodes the array is left unbuilt (null), as before.
        if (nodes == null)
            return;
        for (Node n : nodes)
            addNoLock(new ClassifierItem(n));
        buildValueSetArray();
    }

    /**
     * @param name
     *            the name to set
     */
    public void setName(String name) {
        rwl.w.lock();
        try {
            this.name = name;
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the name
     */
    public String getName() {
        rwl.r.lock();
        try {
            return name;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @return the current array of items, or {@code null} if the array has
     *         never been built (no item was added or loaded yet). A new array
     *         is created on every change of the item set.
     */
    public ClassifierItem[] getValueSet() {
        rwl.r.lock();
        try {
            return valueSetArray;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @return the number of items in the current array, 0 if it has not been
     *         built yet
     */
    public int getValueSetSize() {
        rwl.r.lock();
        try {
            return valueSetArray == null ? 0 : valueSetArray.length;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Rebuilds the array view from the sorted set. Callers must hold the
     * write lock (or be constructing the instance).
     */
    private final void buildValueSetArray() {
        valueSetArray = new ClassifierItem[valueSet.size()];
        valueSet.toArray(valueSetArray);
    }

    /**
     * Adds an item to the set without locking and without rebuilding the
     * array view.
     */
    private final void addNoLock(ClassifierItem item) {
        valueSet.add(item);
    }

    /**
     * Adds an item and rebuilds the array view.
     * 
     * @param item
     *            the item to add
     * @throws SearchLibException
     *             declared for interface compatibility; not thrown by the
     *             current implementation
     */
    public void add(ClassifierItem item) throws SearchLibException {
        rwl.w.lock();
        try {
            addNoLock(item);
            buildValueSetArray();
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * Removes {@code oldItem}, adds {@code newItem} and rebuilds the array
     * view, all under a single write lock.
     * 
     * @param oldItem
     *            the item to remove
     * @param newItem
     *            the item to add
     */
    public void replace(ClassifierItem oldItem, ClassifierItem newItem) {
        rwl.w.lock();
        try {
            valueSet.remove(oldItem);
            valueSet.add(newItem);
            buildValueSetArray();
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * Removes an item and rebuilds the array view.
     * 
     * @param item
     *            the item to remove
     */
    public void remove(ClassifierItem item) {
        rwl.w.lock();
        try {
            valueSet.remove(item);
            buildValueSetArray();
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @param fieldName
     *            the fieldName to set
     */
    public void setFieldName(String fieldName) {
        rwl.w.lock();
        try {
            this.fieldName = fieldName;
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the fieldName
     */
    public String getFieldName() {
        rwl.r.lock();
        try {
            return fieldName;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @param scoreFieldName
     *            the scoreFieldName to set
     */
    public void setScoreFieldName(String scoreFieldName) {
        rwl.w.lock();
        try {
            this.scoreFieldName = scoreFieldName;
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the scoreFieldName
     */
    public String getScoreFieldName() {
        rwl.r.lock();
        try {
            return scoreFieldName;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @param active
     *            the active to set
     */
    public void setActive(boolean active) {
        rwl.w.lock();
        try {
            this.active = active;
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the active
     */
    public boolean isActive() {
        rwl.r.lock();
        try {
            return active;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @param method
     *            the method to set
     */
    public void setMethod(ClassificationMethodEnum method) {
        rwl.w.lock();
        try {
            this.method = method;
        } finally {
            rwl.w.unlock();
        }
    }

    /**
     * @return the method
     */
    public ClassificationMethodEnum getMethod() {
        rwl.r.lock();
        try {
            return method;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Orders classifiers by name. A classifier without a name (the state left
     * by the default constructor) sorts before any named classifier instead
     * of raising a NullPointerException.
     */
    @Override
    public int compareTo(Classifier o) {
        if (name == null)
            return o.name == null ? 0 : -1;
        if (o.name == null)
            return 1;
        return name.compareTo(o.name);
    }

    /**
     * Writes this classifier as a {@code classifier} element: the settings as
     * attributes, an optional {@code defaultValue} child (only when the
     * default value is non-empty) and one {@code classifierItem} child per
     * item.
     * 
     * @param xmlWriter
     *            the writer receiving the XML
     * @throws SAXException
     *             propagated from the writer
     */
    public void writeXml(XmlWriter xmlWriter) throws SAXException {
        rwl.r.lock();
        try {
            xmlWriter.startElement(CLASSIFIER_ITEM_ROOTNODE_NAME,
                    CLASSIFIER_ITEM_ROOT_ATTR_NAME, name,
                    CLASSIFIER_ITEM_ROOT_ATTR_FIELD, fieldName,
                    CLASSIFIER_ITEM_ROOT_ATTR_SCOREFIELD, scoreFieldName,
                    CLASSIFIER_ITEM_ROOT_ATTR_ACTIVE, active ? "yes" : "no",
                    CLASSIFIER_ITEM_ROOT_ATTR_METHOD, method.name());
            if (defaultValue != null && defaultValue.length() > 0) {
                xmlWriter.startElement(CLASSIFIER_ITEM_DEFAULT_VALUE_NODE);
                xmlWriter.textNode(defaultValue);
                xmlWriter.endElement();
            }
            for (ClassifierItem item : valueSet)
                item.writeXml(xmlWriter, CLASSIFIER_ITEM_NODE_NAME);
            xmlWriter.endElement();
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * Adds the value of every item whose score is strictly positive to the
     * target field, and the score itself to the score field when one is
     * configured. The default value is added only if no item matched and a
     * non-empty default value is set. The caller must hold the read lock.
     * <p>
     * NOTE(review): the score is written with {@code Float.toString} here
     * while the best-score method uses {@link #scoreFormat} - confirm whether
     * the difference is intended.
     */
    private void multivaluedClassification(Client client,
            IndexDocument document, LanguageEnum lang, MemoryIndex index)
            throws ParseException, SearchLibException, SyntaxError, IOException {
        boolean useDefaultValue = defaultValue != null
                && defaultValue.length() > 0;
        for (ClassifierItem item : valueSet) {
            float score = item.score(client, lang, index);
            if (score > 0.0f) {
                document.add(fieldName, item.getValue(), item.getBoost());
                if (scoreFieldName != null && scoreFieldName.length() > 0)
                    document.addString(scoreFieldName, Float.toString(score));
                useDefaultValue = false;
            }
        }
        if (useDefaultValue)
            document.add(fieldName, defaultValue, 1.0F);
    }

    /** Format applied to the score written by the best-score method. */
    private final static ThreadSafeDecimalFormat scoreFormat = new ThreadSafeDecimalFormat(
            "0.###########");

    /**
     * Adds only the value of the item with the highest strictly positive
     * score (the first one in set order wins a tie), plus its formatted score
     * when a score field is configured. If no item scores above zero, the
     * non-empty default value, if any, is added instead. The caller must hold
     * the read lock.
     */
    private void bestScoreClassification(Client client, IndexDocument document,
            LanguageEnum lang, MemoryIndex index) throws ParseException,
            SearchLibException, SyntaxError, IOException {
        ClassifierItem selectedItem = null;
        float maxScore = 0;
        for (ClassifierItem item : valueSet) {
            float score = item.score(client, lang, index);
            if (score > maxScore) {
                selectedItem = item;
                maxScore = score;
            }
        }
        if (selectedItem != null) {
            // Read the field directly: the read lock is already held here.
            document.add(fieldName, selectedItem.getValue(),
                    selectedItem.getBoost());
            if (scoreFieldName != null && scoreFieldName.length() > 0) {
                document.addString(scoreFieldName, scoreFormat.format(maxScore));
            }
        } else {
            if (defaultValue != null && defaultValue.length() > 0)
                document.add(fieldName, defaultValue, 1.0F);
        }
    }

    /**
     * Classifies a document. Every field of the document is loaded (values
     * merged with a space) into an in-memory index, analyzed with the
     * schema's per-field index analyzer for the document language; the items
     * are then scored against that index using the configured method. Nothing
     * is added when the method is neither MULTIVALUED nor BESTSCORE.
     * 
     * @param client
     *            the client providing the schema and used to score the items
     * @param document
     *            the document to classify; classification values are added
     *            to it
     * @throws SearchLibException
     *             propagated from the schema lookup or the scoring
     * @throws ParseException
     *             propagated from the scoring
     * @throws SyntaxError
     *             propagated from the scoring
     * @throws IOException
     *             propagated from the scoring
     */
    public void classification(Client client, IndexDocument document)
            throws SearchLibException, ParseException, SyntaxError, IOException {
        rwl.r.lock();
        try {
            MemoryIndex index = new MemoryIndex();
            LanguageEnum lang = document.getLang();
            Analyzer analyzer = client.getSchema().getIndexPerFieldAnalyzer(
                    lang);
            for (FieldContent fieldContent : document) {
                String contentFieldName = fieldContent.getField();
                String concatValues = fieldContent.getMergedValues(" ");
                index.addField(contentFieldName, concatValues, analyzer);
            }
            if (method == ClassificationMethodEnum.MULTIVALUED)
                multivaluedClassification(client, document, lang, index);
            else if (method == ClassificationMethodEnum.BESTSCORE)
                bestScoreClassification(client, document, lang, index);
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @return the defaultValue
     */
    public String getDefaultValue() {
        rwl.r.lock();
        try {
            return defaultValue;
        } finally {
            rwl.r.unlock();
        }
    }

    /**
     * @param defaultValue
     *            the defaultValue to set
     */
    public void setDefaultValue(String defaultValue) {
        rwl.w.lock();
        try {
            this.defaultValue = defaultValue;
        } finally {
            rwl.w.unlock();
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy