All downloads are free. Search and download functionality uses the official Maven repository.

com.olapdb.obase.utils.DocUtil Maven / Gradle / Ivy

The newest version!
package com.olapdb.obase.utils;

import com.olapdb.obase.data.Bytez;
import com.olapdb.obase.data.SearchableEntity;
import com.olapdb.obase.data.Tag;
import com.olapdb.obase.data.index.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;

import java.util.*;

public class DocUtil {
	/**
	 * Finds index entries of {@code entityClass} that are tagged with the words of a search text.
	 * <p>
	 * The text is first split with {@code Util.searchList}; every resulting word is added to an
	 * {@code Indexer} as a {@code "word"} tag and the indexer is asked for its matches. If that
	 * query yields {@code null} or an empty list, the lookup is repeated once with the words
	 * produced by {@code Util.searchListSmart}.
	 * <p>
	 * NOTE(review): the element type {@code Elite} is assumed from how sortWithWords and
	 * sortWithTags iterate their lists — confirm against the declaration of {@code Indexer.list()}.
	 * The bound of {@code entityClass} is not visible in this file, so it is left as a raw type.
	 *
	 * @param entityClass entity class handed to the {@code Indexer} constructor
	 * @param words       free-text search string
	 * @return the matches of the first query, or those of the fallback query when the first one
	 *         found nothing; whatever the fallback returns is passed through unchanged
	 *         (including {@code null} or an empty list)
	 */
	public static List<Elite> searchWords(Class entityClass, String words){
		List<Elite> elites = listByWords(entityClass, Util.searchList(words));
		if(elites == null || elites.isEmpty()){
			// Nothing found with the plain split: retry once with the "smart" split.
			elites = listByWords(entityClass, Util.searchListSmart(words));
		}

		return elites;
	}

	/** Runs one "word"-tag query against a fresh Indexer and returns its result list. */
	private static List<Elite> listByWords(Class entityClass, List<String> searchWords){
		Indexer indexer = new Indexer(entityClass);
		for(String w : searchWords){
			indexer.addTag("word", Value.from(w));
		}
		return indexer.list();
	}

	/**
	 * Scores the given matches by their word hits, sorts them best-first and trims the list
	 * in place to at most {@code maxCount} entries.
	 * <p>
	 * For every elite, {@code total} is the largest value of {@code Bytez.toInt(data, 4)} over
	 * its finds that carry data (never below 1). Each find that has both an index entry and data
	 * then adds {@code weight * Bytez.toInt(data) / total} to the elite's score, where
	 * {@code weight = 0.01 + ln(docCount / idx.getReference())} and {@code docCount} is
	 * {@code Indexer.getRowCount(hostTable)}.
	 * <p>
	 * NOTE(review): the layout of {@code Find.getData()} (what the ints at the start and at
	 * position 4 mean) is not visible in this file — confirm against {@code Bytez}/{@code Find}.
	 *
	 * @param hostTable table whose row count is used as the document count
	 * @param elites    matches to score; modified in place (scores updated, reordered, truncated)
	 * @param maxCount  maximum number of entries to keep; values below zero are treated as zero
	 */
	public static void sortWithWords(Table hostTable, List<Elite> elites, int maxCount){
		long docCount = Indexer.getRowCount(hostTable);
		for(Elite e : elites){
			// First pass: normalisation base for this elite.
			int total = 1;
			for(Find f : e.getMatchs()){
				if(f.getData()!=null){
					total = Math.max(total, Bytez.toInt(f.getData(), 4));
				}
			}

			// Second pass: accumulate the weighted, normalised contribution of every usable find.
			for(Find f : e.getMatchs()){
				if(f.getIdx() !=null && f.getData()!=null){
					double weight = 0.01 + Math.log(docCount*1.0/ f.getIdx().getReference());
					e.setScore(e.getScore() + weight* Bytez.toInt(f.getData())/total);
				}
			}
		}

		// Highest score first; ties are broken by row key so the order is deterministic.
		Collections.sort(elites, (left, right) -> {
			if(left.getScore() > right.getScore())
				return -1;
			if(left.getScore() < right.getScore())
				return 1;

			return Bytez.compareTo(left.getRow(), right.getRow());
		});

		// Drop the tail in one step. Clamping avoids remove(-1) on an emptied list
		// when a negative maxCount is passed.
		int keep = Math.max(maxCount, 0);
		if(elites.size() > keep){
			elites.subList(keep, elites.size()).clear();
		}
	}

	/**
	 * Reorders and trims the given matches according to a {@code Recommender} ranking.
	 * <p>
	 * The row keys of all elites are handed to
	 * {@code new Recommender(hostTable, rows).similar(tags).top(maxCount)}; the list is then
	 * cleared and refilled with the elites belonging to the returned row keys, in the returned
	 * order.
	 * <p>
	 * NOTE(review): the element type of {@code tags} is not visible in this file, so the
	 * parameter is left as a raw {@code List} — confirm against {@code Recommender.similar}.
	 * A returned row key that was not among the input rows would add a {@code null} entry.
	 *
	 * @param hostTable table passed to the {@code Recommender}
	 * @param elites    matches to rank; modified in place
	 * @param tags      tags passed to {@code Recommender.similar}
	 * @param maxCount  number of rows requested from {@code Recommender.top}
	 */
	public static void sortWithTags(Table hostTable, List<Elite> elites, List tags, int maxCount){
		// byte[] has identity equals/hashCode, so the lookup needs a content-based comparator.
		Map<byte[], Elite> eliteByRow = new TreeMap<>(Bytes.BYTES_COMPARATOR);

		List<byte[]> rowBytes = new ArrayList<>(elites.size());
		for(Elite v : elites){
			rowBytes.add(v.getRow());
			eliteByRow.put(v.getRow(), v);
		}
		List<byte[]> ranked = new Recommender(hostTable, rowBytes).similar(tags).top(maxCount);

		elites.clear();
		for(byte[] row : ranked){
			elites.add(eliteByRow.get(row));
		}
	}


	/**
	 * Computes the per-word count difference between a new and an old text.
	 * <p>
	 * Both texts are converted to word/count tables with {@code Fenci.text2words}, and the old
	 * counts are subtracted from the new ones via {@code subWords}.
	 * <p>
	 * This is best-effort: if anything throws, the stack trace is printed and an empty table is
	 * returned, so callers cannot distinguish "no difference" from "failed".
	 *
	 * @param oldStr previous text
	 * @param newStr replacement text
	 * @return word to count-delta table (new minus old); empty on failure
	 */
	public static Hashtable<String, Integer> replaceContent(String oldStr, String newStr) {
		try{
			Hashtable<String, Integer> news = Fenci.text2words(newStr);
			Hashtable<String, Integer> olds = Fenci.text2words(oldStr);

			return subWords(news, olds);
		}catch(Exception e){
			e.printStackTrace();
			return new Hashtable<>();
		}
	}

	/**
	 * Subtracts the counts in {@code olds} from {@code news}, in place.
	 * <p>
	 * For every word in {@code olds} the result holds {@code news[word] - olds[word]}, with a
	 * missing word in {@code news} counted as 0. Words whose difference is 0 are removed, so the
	 * result can contain negative counts but never a zero produced by this method. Words that
	 * occur only in {@code news} are left untouched.
	 *
	 * @param news table to modify; also the return value
	 * @param olds counts to subtract; not modified unless it is the same object as {@code news}
	 * @return {@code news}
	 */
	public static Hashtable<String, Integer> subWords(Hashtable<String, Integer> news, Hashtable<String, Integer> olds){
		// If both arguments are the same table, removing entries while iterating it would throw
		// ConcurrentModificationException, so iterate over a snapshot in that case.
		Map<String, Integer> deltas = (news == olds) ? new HashMap<>(olds) : olds;
		for(Map.Entry<String, Integer> delta : deltas.entrySet()){
			String word = delta.getKey();
			int count = news.getOrDefault(word, 0) - delta.getValue();
			if(count == 0){
				news.remove(word);
			}else{
				news.put(word, count);
			}
		}
		return news;
	}
	/**
	 * Adds the counts in {@code olds} to {@code news}, in place.
	 * <p>
	 * For every word in {@code olds} the result holds {@code news[word] + olds[word]}, with a
	 * missing word in {@code news} counted as 0. Words whose sum is 0 are removed. Words that
	 * occur only in {@code news} are left untouched.
	 *
	 * @param news table to modify; also the return value
	 * @param olds counts to add; not modified unless it is the same object as {@code news}
	 * @return {@code news}
	 */
	public static Hashtable<String, Integer> addWords(Hashtable<String, Integer> news, Hashtable<String, Integer> olds){
		// If both arguments are the same table, removing a zero-sum entry while iterating it would
		// throw ConcurrentModificationException, so iterate over a snapshot in that case.
		Map<String, Integer> deltas = (news == olds) ? new HashMap<>(olds) : olds;
		for(Map.Entry<String, Integer> delta : deltas.entrySet()){
			String word = delta.getKey();
			int count = news.getOrDefault(word, 0) + delta.getValue();
			if(count == 0){
				news.remove(word);
			}else{
				news.put(word, count);
			}
		}
		return news;
	}
	/**
	 * Multiplies every count in the table by {@code factor}, in place.
	 * <p>
	 * No entries are added or removed; a factor of 0 leaves all words present with count 0.
	 *
	 * @param olds   table to scale; also the return value
	 * @param factor multiplier applied to each count
	 * @return {@code olds}
	 */
	public static Hashtable<String, Integer> mulWords(Hashtable<String, Integer> olds, int factor){
		olds.replaceAll((word, count) -> count * factor);
		return olds;
	}

	/**
	 * Re-indexes a document after one of its texts changed.
	 * <p>
	 * The per-word difference between {@code newText} and {@code oldText} is computed with
	 * {@code replaceContent}, scaled by {@code weight} with {@code mulWords}, and applied to the
	 * {@code "word"} column through {@code adjustDocWords}.
	 * <p>
	 * This is best-effort: an exception from {@code adjustDocWords} is printed and swallowed, and
	 * the computed difference is returned regardless of whether the index update succeeded.
	 *
	 * @param doc     entity whose word index is adjusted
	 * @param oldText previous text
	 * @param newText replacement text
	 * @param weight  factor applied to every word-count delta
	 * @return the weighted word to count-delta table that was (attempted to be) applied
	 */
	public static Hashtable<String, Integer> updateTextIndex(SearchableEntity doc, String oldText, String newText, int weight){
		Hashtable<String, Integer> dif = DocUtil.replaceContent(oldText, newText);
		dif = DocUtil.mulWords(dif, weight);
		try {
			DocUtil.adjustDocWords(doc, "word", dif);
		} catch (Exception e) {
			e.printStackTrace();
		}
		return dif;
	}


	/**
	 * Applies a set of per-word adjustments to the index of a document and writes the resulting
	 * puts and deletes to the two Obase tables.
	 * <p>
	 * Visible steps: the keys of {@code fix} are copied into a word list, an {@code Idc} for the
	 * document's basis table and {@code column} is obtained and connected, one {@code WordInfo}
	 * slot per entry of {@code fix} is allocated, pending puts/deletes are collected, the
	 * document's word count is set, and the non-empty batches are sent to
	 * {@code Obase.getIndexTable()} and {@code Obase.getIdxTable()}.
	 * <p>
	 * NOTE(review): this copy of the method is incomplete. The two {@code for} headers below are
	 * cut off mid-statement (everything between a '&lt;' and the next '&gt;' appears to be
	 * missing), so the bodies of step 1 and the whole of steps 2 and 3 are not visible and the
	 * method does not compile as shown. Restore it from the original source before changing it.
	 *
	 * @param doc    entity whose index entries and word count are updated
	 * @param column index column name passed to {@code Idc.getInstance}
	 * @param fix    adjustments keyed by word (presumably word to count delta, as produced by
	 *               updateTextIndex — confirm against the full source)
	 * @throws Exception declared by the method; the throwing calls are not identifiable from the
	 *                   visible part
	 */
	public static void adjustDocWords(SearchableEntity doc, String column, Hashtable fix) throws Exception{
		// NOTE(review): never reassigned in the visible code; presumably updated in the missing part.
		int wordCount = 0;
		int count = fix.size();

		// Snapshot of the words to process.
		List words = new Vector();
		words.addAll(fix.keySet());

		Idc idc= Idc.getInstance(doc.getBasisTable(), column);
		idc.connect();

//		Lunnar lunnar = new Lunnar(5);
//		lunnar.submit(new Runnable(){
//			@Override
//			public void run() {
//				Result[] results = BSL.getIdxTable().get(idxGets);
//				int newIdxCount = 0;
//				for(Result r : results){
//					if(r.isEmpty())newIdxCount ++ ;
//				}
//			}
//		});

		// One slot per adjusted word.
		WordInfo[] wordArray = new WordInfo[count];

		//1. Collate
		List idxGets = new Vector();
		List indexGets = new Vector();
		// NOTE(review): the next line is truncated in this copy; the rest of the loop over the
		// words and the steps that followed it are missing.
		for(int i=0; i idxPuts = new Vector();
		List indexPuts = new Vector();
		List idxDeletes = new Vector();
		List indexDeletes = new Vector();

		// NOTE(review): the next line is truncated as well; "wi" is presumably wordArray[i].
		for(int i=0; i 0){
				if(wi.idx!=null &&  wi.idx.getReference() == wi.ref){
					// Stored reference already equals the new value: nothing to write.
				}else{
					// Write the new reference value for this word's idx row.
					Put put = new Put(wi.idxRow);
					put.addColumn(Bytez.from(Obase.FAMILY_ATTR), Bytez.from(Obase.COLUMN_REFERENCE), Bytez.from(wi.ref));
					idxPuts.add(put);
				}
			}
		}

		//4. Update
		doc.setWordCount(wordCount);

		// Send each batch only when it has content.
		if(!indexPuts.isEmpty())
		Obase.getIndexTable().put(indexPuts);
		if(!indexDeletes.isEmpty())
		Obase.getIndexTable().delete(indexDeletes);

		if(!idxPuts.isEmpty())
		Obase.getIdxTable().put(idxPuts);
		if(!idxDeletes.isEmpty())
		Obase.getIdxTable().delete(idxDeletes);
	}

	/**
	 * Mutable scratch record holding the working state for one word while
	 * adjustDocWords processes it; instances live in that method's {@code wordArray}.
	 */
	private static final class WordInfo{
		/** Row key used for the {@code Put} that writes this word's reference value. */
		byte[] idxRow;
		/** NOTE(review): not used in the visible code; presumably the row key in the index table. */
		byte[] indexRow;
		/** Existing idx entry, or {@code null}; its reference is compared with {@link #ref}. */
		Idx idx;
		/** Reference value to store; written only when it differs from the existing entry's. */
		long ref;
		/** NOTE(review): not used in the visible code; presumably this word's count. */
		int count;
	}
}




© 2015 - 2025 Weber Informatics LLC | Privacy Policy