All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.jaeksoft.searchlib.classifier.Classifier Maven / Gradle / Ivy

Go to download

OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.

The newest version!
/**
 * License Agreement for OpenSearchServer
 * 

* Copyright (C) 2011-2013 Emmanuel Keller / Jaeksoft *

* http://www.open-search-server.com *

* This file is part of OpenSearchServer. *

* OpenSearchServer is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. *

* OpenSearchServer is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. *

* You should have received a copy of the GNU General Public License * along with OpenSearchServer. * If not, see . **/ package com.jaeksoft.searchlib.classifier; import com.jaeksoft.searchlib.Client; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.analysis.LanguageEnum; import com.jaeksoft.searchlib.function.expression.SyntaxError; import com.jaeksoft.searchlib.index.FieldContent; import com.jaeksoft.searchlib.index.IndexDocument; import com.jaeksoft.searchlib.query.ParseException; import com.jaeksoft.searchlib.util.DomUtils; import com.jaeksoft.searchlib.util.FormatUtils.ThreadSafeDecimalFormat; import com.jaeksoft.searchlib.util.ReadWriteLock; import com.jaeksoft.searchlib.util.XPathParser; import com.jaeksoft.searchlib.util.XmlWriter; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.index.memory.MemoryIndex; import org.w3c.dom.Document; import org.w3c.dom.Node; import org.xml.sax.SAXException; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.stream.StreamSource; import javax.xml.xpath.XPathExpressionException; import java.io.File; import java.io.IOException; import java.util.List; import java.util.TreeSet; public class Classifier implements Comparable, XmlWriter.Interface { private final static String CLASSIFIER_ITEM_ROOTNODE_NAME = "classifier"; private final static String CLASSIFIER_ITEM_NODE_NAME = "classifierItem"; private final static String CLASSIFIER_ITEM_ROOT_ATTR_NAME = "name"; private final static String CLASSIFIER_ITEM_ROOT_ATTR_FIELD = "field"; private final static String CLASSIFIER_ITEM_ROOT_ATTR_SCOREFIELD = "scoreField"; private final static String CLASSIFIER_ITEM_ROOT_ATTR_ACTIVE = "active"; private final static String CLASSIFIER_ITEM_ROOT_ATTR_METHOD = "method"; private final static String CLASSIFIER_ITEM_DEFAULT_VALUE_NODE = "defaultValue"; private final ReadWriteLock rwl = new ReadWriteLock(); private String name; private String fieldName; private String scoreFieldName; private boolean active; private String defaultValue; private TreeSet valueSet; private ClassifierItem[] valueSetArray; private ClassificationMethodEnum method; public Classifier() { valueSetArray = null; valueSet = new TreeSet(); name = null; fieldName = null; scoreFieldName = null; active = false; defaultValue = null; method = ClassificationMethodEnum.BESTSCORE; } public Classifier(Classifier source) { this(); source.copyTo(this); } public void copyTo(Classifier target) { rwl.r.lock(); try { target.rwl.w.lock(); try { target.name = name; target.fieldName = fieldName; target.scoreFieldName = scoreFieldName; target.active = active; target.method = method; target.valueSetArray = null; target.defaultValue = defaultValue; target.valueSet.clear(); for (ClassifierItem item : valueSet) target.valueSet.add(new ClassifierItem(item)); target.buildValueSetArray(); } finally { target.rwl.w.unlock(); } } finally { rwl.r.unlock(); } } protected Classifier(File file) throws ParserConfigurationException, SAXException, IOException, XPathExpressionException, SearchLibException { this(); if (!file.exists()) return; Document document = DomUtils.readXml(new StreamSource(file), false); Node rootNode = DomUtils.getFirstNode(document, CLASSIFIER_ITEM_ROOTNODE_NAME); if (rootNode == null) return; setName(XPathParser.getAttributeString(rootNode, CLASSIFIER_ITEM_ROOT_ATTR_NAME)); setFieldName(XPathParser.getAttributeString(rootNode, CLASSIFIER_ITEM_ROOT_ATTR_FIELD)); setScoreFieldName(XPathParser.getAttributeString(rootNode, CLASSIFIER_ITEM_ROOT_ATTR_SCOREFIELD)); setActive("yes".equalsIgnoreCase(XPathParser.getAttributeString(rootNode, CLASSIFIER_ITEM_ROOT_ATTR_ACTIVE))); setMethod(ClassificationMethodEnum .find(XPathParser.getAttributeString(rootNode, CLASSIFIER_ITEM_ROOT_ATTR_METHOD))); Node defaultValueNode = DomUtils.getFirstNode(rootNode, CLASSIFIER_ITEM_DEFAULT_VALUE_NODE); if (defaultValueNode != null) setDefaultValue(defaultValueNode.getTextContent()); List nodes = DomUtils.getNodes(rootNode, CLASSIFIER_ITEM_NODE_NAME); if (nodes == null) return; for (Node n : nodes) addNoLock(new ClassifierItem(n)); buildValueSetArray(); } /** * @param name * the name to set */ public void setName(String name) { this.name = name; } /** * @return the name */ public String getName() { return name; } public ClassifierItem[] getValueSet() { rwl.r.lock(); try { return valueSetArray; } finally { rwl.r.unlock(); } } public int getValueSetSize() { rwl.r.lock(); try { return valueSetArray == null ? 0 : valueSetArray.length; } finally { rwl.r.unlock(); } } private final void buildValueSetArray() { valueSetArray = new ClassifierItem[valueSet.size()]; valueSet.toArray(valueSetArray); } private final void addNoLock(ClassifierItem item) { valueSet.add(item); } public void add(ClassifierItem item) throws SearchLibException { rwl.w.lock(); try { addNoLock(item); buildValueSetArray(); } finally { rwl.w.unlock(); } } public void replace(ClassifierItem oldItem, ClassifierItem newItem) { rwl.w.lock(); try { valueSet.remove(oldItem); valueSet.add(newItem); buildValueSetArray(); } finally { rwl.w.unlock(); } } public void remove(ClassifierItem item) { rwl.w.lock(); try { valueSet.remove(item); buildValueSetArray(); } finally { rwl.w.unlock(); } } /** * @param fieldName * the fieldName to set */ public void setFieldName(String fieldName) { rwl.w.lock(); try { this.fieldName = fieldName; } finally { rwl.w.unlock(); } } /** * @return the fieldName */ public String getFieldName() { return fieldName; } /** * @param scoreFieldName * the scoreFieldName to set */ public void setScoreFieldName(String scoreFieldName) { rwl.w.lock(); try { this.scoreFieldName = scoreFieldName; } finally { rwl.w.unlock(); } } /** * @return the scoreFieldName */ public String getScoreFieldName() { return scoreFieldName; } /** * @param active * the active to set */ public void setActive(boolean active) { this.active = active; } /** * @return the active */ public boolean isActive() { return active; } /** * @param method * the method to set */ public void setMethod(ClassificationMethodEnum method) { this.method = method; } /** * @return the method */ public ClassificationMethodEnum getMethod() { return method; } @Override public int compareTo(Classifier o) { return name.compareTo(o.name); } @Override public void writeXml(XmlWriter xmlWriter) throws SAXException { rwl.r.lock(); try { xmlWriter.startElement(CLASSIFIER_ITEM_ROOTNODE_NAME, CLASSIFIER_ITEM_ROOT_ATTR_NAME, name, CLASSIFIER_ITEM_ROOT_ATTR_FIELD, fieldName, CLASSIFIER_ITEM_ROOT_ATTR_SCOREFIELD, scoreFieldName, CLASSIFIER_ITEM_ROOT_ATTR_ACTIVE, active ? "yes" : "no", CLASSIFIER_ITEM_ROOT_ATTR_METHOD, method.name()); if (defaultValue != null && defaultValue.length() > 0) { xmlWriter.startElement(CLASSIFIER_ITEM_DEFAULT_VALUE_NODE); xmlWriter.textNode(defaultValue); xmlWriter.endElement(); } for (ClassifierItem item : valueSet) item.writeXml(xmlWriter, CLASSIFIER_ITEM_NODE_NAME); xmlWriter.endElement(); } finally { rwl.r.unlock(); } } private void multivaluedClassification(Client client, IndexDocument document, LanguageEnum lang, MemoryIndex index) throws ParseException, SearchLibException, SyntaxError, IOException { boolean setDefaultValue = defaultValue != null && defaultValue.length() > 0; for (ClassifierItem item : valueSet) { float score = item.score(client, lang, index); if (score > 0.0f) { document.add(fieldName, item.getValue(), item.getBoost()); if (scoreFieldName != null && scoreFieldName.length() > 0) document.addString(scoreFieldName, Float.toString(score)); setDefaultValue = false; } } if (setDefaultValue) document.add(fieldName, defaultValue, 1.0F); } private final static ThreadSafeDecimalFormat scoreFormat = new ThreadSafeDecimalFormat("0.###########"); private void bestScoreClassification(Client client, IndexDocument document, LanguageEnum lang, MemoryIndex index) throws ParseException, SearchLibException, SyntaxError, IOException { ClassifierItem selectedItem = null; float maxScore = 0; for (ClassifierItem item : valueSet) { float score = item.score(client, lang, index); if (score > maxScore) { selectedItem = item; maxScore = score; } } if (selectedItem != null) { document.add(getFieldName(), selectedItem.getValue(), selectedItem.getBoost()); if (scoreFieldName != null && scoreFieldName.length() > 0) { document.addString(scoreFieldName, scoreFormat.format(maxScore)); } } else { if (defaultValue != null && defaultValue.length() > 0) document.add(fieldName, defaultValue, 1.0F); } } public void classification(Client client, IndexDocument document) throws SearchLibException, ParseException, SyntaxError, IOException { rwl.r.lock(); try { MemoryIndex index = new MemoryIndex(); LanguageEnum lang = document.getLang(); Analyzer analyzer = client.getSchema().getIndexPerFieldAnalyzer(lang); for (FieldContent fieldContent : document) { String fieldName = fieldContent.getField(); String concatValues = fieldContent.getMergedValues(" "); index.addField(fieldName, concatValues, analyzer); } if (method == ClassificationMethodEnum.MULTIVALUED) multivaluedClassification(client, document, lang, index); else if (method == ClassificationMethodEnum.BESTSCORE) bestScoreClassification(client, document, lang, index); } finally { rwl.r.unlock(); } } /** * @return the defaultValue */ public String getDefaultValue() { rwl.r.lock(); try { return defaultValue; } finally { rwl.r.unlock(); } } /** * @param defaultValue * the defaultValue to set */ public void setDefaultValue(String defaultValue) { rwl.w.lock(); try { this.defaultValue = defaultValue; } finally { rwl.w.unlock(); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy