All downloads are free. Search and download functionality uses the official Maven repository.

net.anotheria.anodoc.query2.string.ContainsWordsQuery Maven / Gradle / Ivy

The newest version!
package net.anotheria.anodoc.query2.string;

import net.anotheria.anodoc.data.Document;
import net.anotheria.anodoc.data.Property;
import net.anotheria.anodoc.data.StringProperty;
import net.anotheria.anodoc.query2.DocumentQuery;
import net.anotheria.anodoc.query2.QueryResultEntry;
import net.anotheria.asg.data.DataObject;
import net.anotheria.util.IOUtils;
import net.anotheria.util.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;


/**
 * 

ContainsWordsQuery class.

* * @author another * @version $Id: $Id */ public class ContainsWordsQuery implements DocumentQuery{ /** * {@link Logger} instance. */ private static final Logger LOGGER = LoggerFactory.getLogger(ContainsWordsQuery.class); public static class Index{ Map index = new HashMap(); int size = 0; public void indexWord(String word, int position){ if(index.containsKey(word)) index.get(word).addOccurrence(position); else index.put(word, new Occurrenece(position)); size = Math.max(size, position); } public boolean containsWord(String word){ return index.containsKey(word); } public Occurrenece getWordOccuerrence(String word){ return index.get(word); } public String rebuildText(){ StringBuilder builder = new StringBuilder(); for(int i = 1; i <= size; i++){ for(String word: index.keySet()){ Occurrenece occurrenece = index.get(word); if(occurrenece.containsPosistion(i)) builder.append(word).append(" "); } } return builder.toString(); } @Override public String toString() { return "Index [size=" + size + ", index=" + index + "]"; } } private static class Occurrenece{ private Set positions = new HashSet(); public Occurrenece(int aFirstPosition){ positions.add(aFirstPosition); } public void addOccurrence(int position){ positions.add(position); } public boolean containsPosistion(int position){ return positions.contains(position); } @Override public String toString() { return "Occurrenece [positions=" + positions + "]"; } } /** Constant WORDS_SEPARATORS */ public static final Map WORDS_SEPARATORS = new HashMap(); static{ WORDS_SEPARATORS.put("\n", " "); WORDS_SEPARATORS.put("\t", " "); WORDS_SEPARATORS.put(".", " "); WORDS_SEPARATORS.put(",", " "); WORDS_SEPARATORS.put("!", " "); WORDS_SEPARATORS.put("?", " "); WORDS_SEPARATORS.put("-", " "); WORDS_SEPARATORS.put("_", " "); } private String[] criteria; private Set propertiesToSearch = Collections.emptySet(); /** *

Constructor for ContainsWordsQuery.

* * @param aCriteria a {@link java.lang.String} object. */ public ContainsWordsQuery(String aCriteria){ this(aCriteria, new String[]{}); } /** *

Constructor for ContainsWordsQuery.

* * @param aCriteria a {@link java.lang.String} object. * @param aPropertiesToSearch a {@link java.lang.String} object. */ public ContainsWordsQuery(String aCriteria, String... aPropertiesToSearch){ aCriteria = aCriteria.toLowerCase().trim(); aCriteria = StringUtils.replace(aCriteria, WORDS_SEPARATORS); criteria = StringUtils.tokenize(aCriteria,' '); propertiesToSearch = new HashSet(); for(String prop: aPropertiesToSearch) propertiesToSearch.add(prop); } /** {@inheritDoc} */ public List match(DataObject obj) { LOGGER.debug("Match DataObject " + obj.getDefinedName() + " with ID " + obj.getId() + "." + this); if (!(obj instanceof Document)) throw new AssertionError("Supports only search in a Document instance!"); return match((Document)obj); } /** *

match.

* * @param doc a {@link net.anotheria.anodoc.data.Document} object. * @return a {@link java.util.List} object. */ public List match(Document doc) { List ret = new ArrayList(); List properties = doc.getProperties(); Set matchRegression = new HashSet(Arrays.asList(criteria)); int i = 0; for (Property p: properties){ i++; Index propertyIndex = buildPropertyIndex(p); Set toRemove = new HashSet(); for(String match: matchRegression) if(propertyIndex.containsWord(match)) toRemove.add(match); matchRegression.removeAll(toRemove); if(matchRegression.isEmpty()){ QueryResultEntry res = new QueryResultEntry(); res.setMatchedDocument(doc); res.setMatchedProperty(p); //TODO: sophisticated relevance calculation res.setRelevance(100/i); ret.add(res); return ret; } } return ret; } private Index buildPropertyIndex(Property p){ Index ret = new Index(); String valueStr = p.getValue().toString().toLowerCase().trim(); if(StringUtils.isEmpty(valueStr)) return ret; valueStr = StringUtils.removeChar(valueStr, '\r'); valueStr = StringUtils.replace(valueStr, WORDS_SEPARATORS); String[] wordTokens = StringUtils.tokenize(valueStr, true, ' '); for(int i = 0; i < wordTokens.length; i++){ String word = wordTokens[i]; ret.indexWord(word, i + 1); } return ret; } /** {@inheritDoc} */ @Override public String toString() { return "ContainsAllQuery [criteria=" + Arrays.toString(criteria) + ", propertiesToSearch=" + propertiesToSearch + "]"; } //TODO: write Unit Tests /** *

main.

* * @param args an array of {@link java.lang.String} objects. * @throws java.lang.Exception if any. */ public static void main(String[] args) throws Exception{ ContainsWordsQuery query = new ContainsWordsQuery("search query indexing matrix Ноутбук"); Document doc = new Document("sample"); String sample = IOUtils.readInputStreamBufferedAsString(query.getClass().getClassLoader().getResourceAsStream("sampletext.txt"), "utf-8"); StringProperty p = new StringProperty("sampleProp1"); p.setValue(sample); doc.putStringProperty(p); sample = IOUtils.readInputStreamBufferedAsString(query.getClass().getClassLoader().getResourceAsStream("sampletext2.txt"), "utf-8"); p = new StringProperty("sampleProp2"); p.setValue(sample); doc.putStringProperty(p); p = new StringProperty("sampleProp3"); p.setValue(null); doc.putStringProperty(p); System.out.println("** RESULT: **"); List result = query.match(doc); System.out.println(result); } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy