com.jaeksoft.searchlib.learning.StandardLearner Maven / Gradle / Ivy

Go to download
Show more of this group Show more artifacts with this name
Show all versions of opensearchserver Show documentation
OpenSearchServer is a powerful, enterprise-class search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTful API, you will be able to quickly and easily integrate advanced full-text search capabilities into your application. OpenSearchServer runs on Windows and Linux/Unix/BSD.
The newest version!
/**   
 * License Agreement for OpenSearchServer
 *
 * Copyright (C) 2013 Emmanuel Keller / Jaeksoft
 * 
 * http://www.open-search-server.com
 * 
 * This file is part of OpenSearchServer.
 *
 * OpenSearchServer is free software: you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 *  (at your option) any later version.
 *
 * OpenSearchServer is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with OpenSearchServer. 
 *  If not, see <http://www.gnu.org/licenses/>.
 **/

package com.jaeksoft.searchlib.learning;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.TreeMap;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;

import com.fasterxml.jackson.core.JsonParseException;
import com.fasterxml.jackson.databind.JsonMappingException;
import com.jaeksoft.searchlib.Client;
import com.jaeksoft.searchlib.SearchLibException;
import com.jaeksoft.searchlib.analysis.Analyzer;
import com.jaeksoft.searchlib.analysis.LanguageEnum;
import com.jaeksoft.searchlib.crawler.FieldMap;
import com.jaeksoft.searchlib.index.IndexDocument;
import com.jaeksoft.searchlib.request.AbstractLocalSearchRequest;
import com.jaeksoft.searchlib.request.AbstractSearchRequest;
import com.jaeksoft.searchlib.result.AbstractResultSearch;
import com.jaeksoft.searchlib.result.ResultDocument;
import com.jaeksoft.searchlib.result.ResultSearchSingle;
import com.jaeksoft.searchlib.schema.Indexed;
import com.jaeksoft.searchlib.schema.Schema;
import com.jaeksoft.searchlib.schema.SchemaField;
import com.jaeksoft.searchlib.schema.Stored;
import com.jaeksoft.searchlib.schema.TermVector;
import com.jaeksoft.searchlib.util.InfoCallback;
import com.jaeksoft.searchlib.util.JsonUtils;
import com.jaeksoft.searchlib.util.ReadWriteLock;
import com.jaeksoft.searchlib.util.map.TargetField;

public class StandardLearner implements LearnerInterface {

	private static final String FIELD_SOURCE_DATA = "data";
	private static final String FIELD_SOURCE_TARGET = "target";
	private static final String FIELD_SOURCE_NAME = "name";
	private static final String FIELD_SOURCE_CUSTOM = "custom";

	private static final String[] SOURCE_FIELDS = { FIELD_SOURCE_DATA,
			FIELD_SOURCE_TARGET, FIELD_SOURCE_NAME, FIELD_SOURCE_CUSTOM };

	private static final String FIELD_TARGET_LABEL = StringUtils.EMPTY;
	private static final String FIELD_TARGET_SCORE = StringUtils.EMPTY;

	private final String[] TARGET_FIELDS = { FIELD_TARGET_LABEL,
			FIELD_TARGET_SCORE };

	private final String REQUEST_SEARCH = "search";
	private final String REQUEST_CUSTOM = "custom";

	private final ReadWriteLock rwl = new ReadWriteLock();

	private File indexDir = null;

	private Client learnerClient = null;

	@Override
	public void init(File instancesDir) throws SearchLibException {
		rwl.r.lock();
		try {
			if (instancesDir != null && indexDir != null)
				if (instancesDir.compareTo(indexDir) == 0)
					return;
		} finally {
			rwl.r.unlock();
		}
		rwl.w.lock();
		try {
			closeNoLock();
			this.indexDir = instancesDir;
		} finally {
			rwl.w.unlock();
		}
	}

	private Collection checkDataFields(
			final FieldMap sourceFieldMap) throws SearchLibException,
			IOException {
		if (learnerClient == null || sourceFieldMap == null)
			return null;
		Schema schema = learnerClient.getSchema();
		TreeMap boostMap = new TreeMap();
		sourceFieldMap.populateBoosts(FIELD_SOURCE_DATA, boostMap);
		for (TargetField targetField : boostMap.values()) {
			String fieldName = targetField.getBoostedName();
			if (schema.getField(fieldName) != null)
				continue;
			schema.setField(fieldName, Stored.NO, Indexed.YES, TermVector.YES,
					"StandardAnalyzer");
		}
		return boostMap.values();
	}

	private Collection checkIndex(FieldMap sourceFieldMap)
			throws SearchLibException, IOException {
		if (learnerClient != null)
			return checkDataFields(sourceFieldMap);
		if (indexDir == null)
			throw new SearchLibException("Index directory not set");
		if (!indexDir.exists())
			indexDir.mkdir();
		learnerClient = new Client(indexDir,
				"/com/jaeksoft/searchlib/learner_config.xml", true);
		return checkDataFields(sourceFieldMap);
	}

	private void closeNoLock() {
		if (learnerClient == null)
			return;
		learnerClient.close();
		learnerClient = null;
	}

	@Override
	public void close() {
		rwl.w.lock();
		try {
			closeNoLock();
		} finally {
			rwl.w.unlock();
		}
	}

	@Override
	public void reset() throws SearchLibException {
		rwl.w.lock();
		try {
			if (learnerClient != null) {
				learnerClient.deleteAll();
				learnerClient.close();
				learnerClient.delete();
				learnerClient = null;
			}
		} catch (IOException e) {
			throw new SearchLibException(e);
		} finally {
			rwl.w.unlock();
		}
	}

	@Override
	public long getDocumentCount() throws IOException, SearchLibException {
		rwl.r.lock();
		try {
			if (learnerClient == null)
				checkIndex(null);
			if (learnerClient == null)
				return 0;
			return learnerClient.getStatistics().getNumDocs();
		} finally {
			rwl.r.unlock();
		}
	}

	@Override
	public void learn(Client client, String requestName,
			Collection sources, FieldMap sourceFieldMap)
			throws IOException, SearchLibException {
		if (CollectionUtils.isEmpty(sources))
			return;
		AbstractResultSearch result = null;
		List learnIndexDocuments = new ArrayList(
				sources.size());
		rwl.r.lock();
		try {
			checkIndex(sourceFieldMap);
			String uniqueField = client.getSchema().getUniqueField();
			if (StringUtils.isEmpty(uniqueField))
				return;
			for (IndexDocument source : sources) {
				AbstractSearchRequest request = (AbstractSearchRequest) client
						.getNewRequest(requestName);
				request.setStart(0);
				request.setRows(1);
				request.setEmptyReturnsAll(true);
				String uniqueTerm = source.getFieldValueString(uniqueField, 0);
				if (StringUtils.isEmpty(uniqueTerm))
					continue;
				request.addTermFilter(uniqueField, uniqueTerm, false);
				result = (AbstractResultSearch) client.request(request);
				if (result.getDocumentCount() != 1)
					continue;
				addNewlearnDocument(sourceFieldMap, result, 0,
						learnIndexDocuments);
			}
		} finally {
			rwl.r.unlock();
		}
		rwl.w.lock();
		try {
			learnerClient.updateDocuments(learnIndexDocuments);
		} finally {
			rwl.w.unlock();
		}
	}

	@Override
	public void remove(Client client, String searchRequest, String field,
			Collection values, FieldMap sourceFieldMap)
			throws SearchLibException {
		rwl.r.lock();
		try {
			checkIndex(sourceFieldMap);
			String uniqueField = client.getSchema().getUniqueField();
			if (StringUtils.isEmpty(uniqueField))
				return;
			if (!sourceFieldMap.isMapped(field, FIELD_SOURCE_NAME))
				return;
		} catch (IOException e) {
			throw new SearchLibException(e);
		} finally {
			rwl.r.unlock();
		}
		rwl.w.lock();
		try {
			learnerClient.deleteDocuments(FIELD_SOURCE_NAME, values);
		} finally {
			rwl.w.unlock();
		}
	}

	private BooleanQuery getBooleanQuery(String fieldName, String data)
			throws IOException, SearchLibException {
		BooleanQuery booleanQuery = new BooleanQuery();
		Schema schema = learnerClient.getSchema();
		SchemaField schemaField = schema.getFieldList().get(fieldName);
		Analyzer analyzer = schema.getAnalyzer(schemaField,
				LanguageEnum.UNDEFINED);
		analyzer.getQueryAnalyzer().toBooleanQuery(fieldName, data,
				booleanQuery, Occur.SHOULD);
		return booleanQuery;
	}

	@Override
	public void similar(String data, FieldMap sourceFieldMap, int maxRank,
			double minScore, Collection collector)
			throws IOException, SearchLibException {
		rwl.r.lock();
		try {
			checkIndex(sourceFieldMap);
			BooleanQuery booleanQuery = getBooleanQuery(FIELD_SOURCE_DATA, data);
			if (booleanQuery == null || booleanQuery.getClauses() == null
					|| booleanQuery.getClauses().length == 0)
				return;
			AbstractLocalSearchRequest searchRequest = (AbstractLocalSearchRequest) learnerClient
					.getNewRequest(REQUEST_SEARCH);
			int start = 0;
			final int rows = 1000;
			List termVectors = new ArrayList(rows);
			List stringIndexTerms = new ArrayList(rows);
			for (;;) {
				termVectors.clear();
				stringIndexTerms.clear();
				searchRequest.setStart(start);
				searchRequest.setRows(rows);
				searchRequest.setBoostedComplexQuery(booleanQuery);
				AbstractResultSearch result = (AbstractResultSearch) learnerClient
						.request(searchRequest);
				if (result.getDocumentCount() == 0)
					break;
				int end = start + result.getDocumentCount();
				int[] docIds = ArrayUtils.subarray(
						ResultDocument.getDocIds(result.getDocs()), start, end);
				learnerClient.getIndex().putTermVectors(docIds,
						FIELD_SOURCE_TARGET, termVectors);
				learnerClient.getIndex().getStringIndex(FIELD_SOURCE_NAME)
						.putTerms(docIds, stringIndexTerms);
				int i = -1;
				for (int pos = start; pos < end; pos++) {
					i++;
					String[] terms = termVectors.get(i);
					if (terms == null)
						continue;
					double docScore = result.getScore(pos);
					if (docScore < minScore)
						break;
					String name = stringIndexTerms.get(i);
					if (name == null)
						continue;
					for (String value : terms) {
						if (value == null)
							continue;
						collector.add(new LearnerResultItem(docScore, pos,
								null, value, 1, null));
						break;
					}
				}
				searchRequest.reset();
				start += rows;
			}
		} finally {
			rwl.r.unlock();
		}
	}

	private void fieldClassify(String fieldName, Float boost, String data,
			TreeMap targetMap)
			throws SearchLibException, IOException {
		AbstractLocalSearchRequest searchRequest = (AbstractLocalSearchRequest) learnerClient
				.getNewRequest(REQUEST_SEARCH);
		BooleanQuery booleanQuery = getBooleanQuery(fieldName, data);
		if (booleanQuery == null || booleanQuery.getClauses() == null
				|| booleanQuery.getClauses().length == 0)
			return;
		int start = 0;
		final int rows = 1000;
		List termVectors = new ArrayList(rows);
		List stringIndexTerms = new ArrayList(rows);
		for (;;) {
			termVectors.clear();
			stringIndexTerms.clear();
			searchRequest.setStart(start);
			searchRequest.setRows(rows);
			searchRequest.setBoostedComplexQuery(booleanQuery);
			ResultSearchSingle result = (ResultSearchSingle) learnerClient
					.request(searchRequest);
			if (result.getDocumentCount() == 0)
				break;
			int end = start + result.getDocumentCount();
			int[] docIds = ArrayUtils.subarray(
					ResultDocument.getDocIds(result.getDocs()), start, end);
			learnerClient.getIndex().putTermVectors(docIds,
					FIELD_SOURCE_TARGET, termVectors);
			learnerClient.getIndex().getStringIndex(FIELD_SOURCE_NAME)
					.putTerms(docIds, stringIndexTerms);
			int i = -1;
			for (int pos = start; pos < end; pos++) {
				i++;
				String[] terms = termVectors.get(i);
				if (terms == null)
					continue;
				double docScore = result.getScore(pos);
				if (boost != null)
					docScore = docScore * boost;
				String name = stringIndexTerms.get(i);
				for (String value : terms) {
					if (value == null)
						continue;
					LearnerResultItem learnerResultItem = targetMap.get(value);
					if (learnerResultItem == null) {
						learnerResultItem = new LearnerResultItem(0, -1, value,
								null, 0, null);
						targetMap.put(value, learnerResultItem);
					}
					learnerResultItem.addScoreInstance(docScore, 1, name);
				}
			}
			searchRequest.reset();
			start += rows;
		}
	}

	@Override
	public void classify(String data, FieldMap sourceFieldMap, int maxRank,
			double minScore, Collection collector)
			throws IOException, SearchLibException {
		rwl.r.lock();
		try {
			Collection targetFields = checkIndex(sourceFieldMap);
			TreeMap targetMap = new TreeMap();
			fieldClassify(FIELD_SOURCE_DATA, null, data, targetMap);
			for (TargetField targetField : targetFields)
				fieldClassify(targetField.getBoostedName(),
						targetField.getBoost(), data, targetMap);
			for (LearnerResultItem learnerResultItem : targetMap.values()) {
				learnerResultItem.score = learnerResultItem.score
						/ learnerResultItem.count
						* Math.log1p(learnerResultItem.count);
				if (learnerResultItem.score > minScore)
					collector.add(learnerResultItem);
			}
		} finally {
			rwl.r.unlock();
		}
	}

	private void addNewlearnDocument(FieldMap sourceFieldMap,
			AbstractResultSearch result, int pos,
			Collection learnIndexDocuments) throws IOException,
			SearchLibException {
		ResultDocument resultDocument = result.getDocument(pos);
		if (resultDocument == null)
			return;
		IndexDocument target = new IndexDocument();
		sourceFieldMap.mapIndexDocument(resultDocument, target);
		sourceFieldMap.mapIndexDocumentJson(FIELD_SOURCE_CUSTOM,
				resultDocument, target);
		List joinResultDocuments = result.getJoinDocumentList(
				pos, null);
		if (joinResultDocuments != null)
			for (ResultDocument joinResultDocument : joinResultDocuments)
				sourceFieldMap.mapIndexDocument(joinResultDocument, target);
		learnIndexDocuments.add(target);
	}

	@Override
	public void learn(Client client, String requestName,
			FieldMap sourceFieldMap, final int buffer, InfoCallback infoCallback)
			throws SearchLibException, IOException {
		rwl.w.lock();
		try {
			checkIndex(sourceFieldMap);
			int count = 0;
			learnerClient.deleteAll();
			AbstractSearchRequest request = (AbstractSearchRequest) client
					.getNewRequest(requestName);
			int start = 0;
			List indexDocumentList = new ArrayList(
					buffer);
			request.setRows(buffer);
			request.setEmptyReturnsAll(true);
			for (;;) {
				request.setStart(start);
				AbstractResultSearch result = (AbstractResultSearch) client
						.request(request);
				if (result.getDocumentCount() == 0)
					break;
				for (int i = 0; i < result.getDocumentCount(); i++)
					addNewlearnDocument(sourceFieldMap, result, start + i,
							indexDocumentList);
				learnerClient.updateDocuments(indexDocumentList);
				count += indexDocumentList.size();
				indexDocumentList.clear();
				if (infoCallback != null)
					infoCallback.setInfo(count + " document(s) learned.");
				request.reset();
				start += buffer;
			}
		} finally {
			rwl.w.unlock();
		}
	}

	@Override
	public TreeMap> getCustoms(String name)
			throws SearchLibException {
		AbstractSearchRequest searchRequest = (AbstractSearchRequest) learnerClient
				.getNewRequest(REQUEST_CUSTOM);
		searchRequest.setQueryString(name);
		AbstractResultSearch result = (AbstractResultSearch) learnerClient
				.request(searchRequest);
		if (result.getDocumentCount() == 0)
			return null;
		ResultDocument doc = result.getDocument(0);
		String json = doc.getValueContent(FIELD_SOURCE_CUSTOM, 0);
		if (StringUtils.isEmpty(json))
			return null;
		try {
			return JsonUtils.getObject(json,
					JsonUtils.MapStringListStringTypeRef);
		} catch (JsonParseException e) {
			throw new SearchLibException(e);
		} catch (JsonMappingException e) {
			throw new SearchLibException(e);
		} catch (IOException e) {
			throw new SearchLibException(e);
		}
	}

	@Override
	public String[] getSourceFieldList() {
		return SOURCE_FIELDS;
	}

	@Override
	public String[] getTargetFieldList() {
		return TARGET_FIELDS;
	}

}