com.olapdb.obase.utils.DocUtil Maven / Gradle / Ivy
The newest version!
package com.olapdb.obase.utils;
import com.olapdb.obase.data.Bytez;
import com.olapdb.obase.data.SearchableEntity;
import com.olapdb.obase.data.Tag;
import com.olapdb.obase.data.index.*;
import org.apache.hadoop.hbase.client.*;
import org.apache.hadoop.hbase.util.Bytes;
import java.util.*;
public class DocUtil {
/**
 * Finds the entities of {@code entityClass} that are indexed under the words of a query.
 *
 * <p>The query is first split with {@code Util.searchList}. If that lookup yields
 * {@code null} or an empty list, it is repeated once with the terms produced by
 * {@code Util.searchListSmart}.
 *
 * @param entityClass entity class handed to the {@code Indexer} constructor
 * @param words       raw query text
 * @return whatever {@code Indexer.list()} returned for the last attempt that was made;
 *         may be {@code null} or empty when neither attempt matched
 */
public static List searchWords(Class entityClass, String words){
    List elites = listByWords(entityClass, words, false);
    if(elites == null || elites.isEmpty()){
        // The plain term list matched nothing: retry with the "smart" term list.
        elites = listByWords(entityClass, words, true);
    }
    return elites;
}

/**
 * Builds a fresh Indexer, tags it with every term of the query under "word" and runs it.
 *
 * @param smart {@code true} to split with {@code Util.searchListSmart},
 *              {@code false} to split with {@code Util.searchList}
 */
private static List listByWords(Class entityClass, String words, boolean smart){
    Indexer indexer = new Indexer(entityClass);
    // Typed as List<String>: iterating a raw List as String does not compile.
    List<String> terms = smart ? Util.searchListSmart(words) : Util.searchList(words);
    for(String term : terms){
        indexer.addTag("word", Value.from(term));
    }
    return indexer.list();
}
/**
 * Scores every match from its matched words, sorts the list best-first and trims it
 * to at most {@code maxCount} entries. The list is modified in place.
 *
 * <p>For each match the score is increased, per matched word that has both an index
 * entry and a payload, by
 * {@code (0.01 + ln(docCount / reference)) * Bytez.toInt(data) / total}, where
 * {@code docCount} is {@code Indexer.getRowCount(hostTable)} and {@code total} is the
 * largest {@code Bytez.toInt(data, 4)} over the match's payloads (never less than 1).
 * Ties on score are broken by comparing the row keys with {@code Bytez.compareTo}.
 *
 * @param hostTable table whose row count is used as the document count
 * @param elites    matches to score, sort and trim
 * @param maxCount  maximum number of entries to keep; values below 0 are treated as 0
 */
public static void sortWithWords(Table hostTable, List<Elite> elites, int maxCount){
    long docCount = Indexer.getRowCount(hostTable);
    for(Elite e : elites){
        // NOTE(review): the second argument of Bytez.toInt is presumably a byte offset - confirm.
        int total = 1;
        for(Find f : e.getMatchs()){
            if(f.getData() != null){
                total = Math.max(total, Bytez.toInt(f.getData(), 4));
            }
        }
        for(Find f : e.getMatchs()){
            if(f.getIdx() != null && f.getData() != null){
                double weight = 0.01 + Math.log(docCount * 1.0 / f.getIdx().getReference());
                e.setScore(e.getScore() + weight * Bytez.toInt(f.getData()) / total);
            }
        }
    }
    // Double.compare gives a total order even when a score is NaN (e.g. 0.0/0 above);
    // plain '>' / '<' tests do not, which can make the sort throw
    // "Comparison method violates its general contract".
    elites.sort(new Comparator<Elite>(){
        @Override
        public int compare(Elite left, Elite right) {
            int byScore = Double.compare(right.getScore(), left.getScore()); // higher score first
            if(byScore != 0){
                return byScore;
            }
            return Bytez.compareTo(left.getRow(), right.getRow());
        }
    });
    // Clamp so a negative limit empties the list instead of calling remove(-1).
    int keep = Math.max(0, maxCount);
    if(elites.size() > keep){
        elites.subList(keep, elites.size()).clear();
    }
}
/**
 * Re-orders and trims {@code elites} according to a {@code Recommender} ranking.
 *
 * <p>The row keys of all matches are handed to
 * {@code new Recommender(hostTable, rows).similar(tags).top(maxCount)}; the list is
 * then cleared and refilled with the matches belonging to the returned rows, in the
 * returned order. The list is modified in place.
 *
 * @param hostTable table passed to the {@code Recommender}
 * @param elites    matches to re-order; replaced by the ranked subset
 * @param tags      tags passed to {@code Recommender.similar}
 *                  (NOTE(review): element type not visible here, presumably Tag - confirm)
 * @param maxCount  limit passed to {@code Recommender.top}
 */
public static void sortWithTags(Table hostTable, List<Elite> elites, List tags, int maxCount){
    // byte[] has identity equals/hashCode, so the lookup is keyed with Bytes.BYTES_COMPARATOR.
    Map<byte[], Elite> byRow = new TreeMap<>(Bytes.BYTES_COMPARATOR);
    List<byte[]> candidates = new Vector<>();
    for(Elite elite : elites){
        candidates.add(elite.getRow());
        byRow.put(elite.getRow(), elite);
    }
    List<byte[]> ranked = new Recommender(hostTable, candidates).similar(tags).top(maxCount);
    elites.clear();
    for(byte[] row : ranked){
        Elite elite = byRow.get(row);
        // A row that was not among the candidates has no match; skip it rather than add null.
        if(elite != null){
            elites.add(elite);
        }
    }
}
/**
 * Computes the word-count difference produced by replacing one text with another.
 *
 * <p>Both texts are turned into word tables with {@code Fenci.text2words} (new text
 * first), and the old table is subtracted from the new one with {@code subWords}.
 * If anything throws, the stack trace is printed and an empty table is returned.
 *
 * @param oldStr text being replaced
 * @param newStr replacement text
 * @return the result of {@code subWords(newWords, oldWords)}, or an empty table on failure
 */
public static Hashtable replaceContent(String oldStr, String newStr) {
    Hashtable difference;
    try{
        Hashtable incoming = Fenci.text2words(newStr);
        Hashtable outgoing = Fenci.text2words(oldStr);
        difference = subWords(incoming, outgoing);
    }catch(Exception e){
        e.printStackTrace();
        difference = new Hashtable();
    }
    return difference;
}
/**
 * Subtracts the counts in {@code olds} from {@code news}, in place.
 *
 * <p>For every word of {@code olds} the count in {@code news} (0 when absent) is
 * reduced by the old count. A word whose result is 0 is removed from {@code news};
 * any other result, including a negative one, is stored. {@code olds} is not modified
 * unless it is the same object as {@code news}.
 *
 * @param news table to update; also the return value
 * @param olds counts to subtract
 * @return {@code news}, after modification
 */
public static Hashtable<String, Integer> subWords(Hashtable<String, Integer> news, Hashtable<String, Integer> olds){
    if(news == olds){
        // x - x is empty; removing entries while iterating the same table would
        // otherwise throw ConcurrentModificationException.
        news.clear();
        return news;
    }
    for(Map.Entry<String, Integer> entry : olds.entrySet()){
        String word = entry.getKey();
        int remaining = news.getOrDefault(word, 0) - entry.getValue();
        if(remaining == 0){
            news.remove(word);
        }else{
            news.put(word, remaining);
        }
    }
    return news;
}
/**
 * Adds the counts in {@code olds} to {@code news}, in place.
 *
 * <p>For every word of {@code olds} the count in {@code news} (0 when absent) is
 * increased by the old count. A word whose result is 0 is removed from {@code news};
 * any other result is stored.
 *
 * @param news table to update; also the return value
 * @param olds counts to add
 * @return {@code news}, after modification
 */
public static Hashtable<String, Integer> addWords(Hashtable<String, Integer> news, Hashtable<String, Integer> olds){
    for(Map.Entry<String, Integer> entry : olds.entrySet()){
        String word = entry.getKey();
        int sum = news.getOrDefault(word, 0) + entry.getValue();
        if(sum == 0){
            news.remove(word);
        }else{
            news.put(word, sum);
        }
    }
    return news;
}
/**
 * Multiplies every count in {@code olds} by {@code factor}, in place.
 *
 * <p>No entry is removed, so a factor of 0 leaves every word present with count 0.
 *
 * @param olds   table to scale; also the return value
 * @param factor multiplier applied to each count
 * @return {@code olds}, after modification
 */
public static Hashtable<String, Integer> mulWords(Hashtable<String, Integer> olds, int factor){
    olds.replaceAll((word, count) -> count * factor);
    return olds;
}
/**
 * Re-indexes a document after one of its texts changed.
 *
 * <p>The word-count difference between the two texts is taken from
 * {@code replaceContent}, scaled by {@code weight} with {@code mulWords} and applied
 * to the document's "word" column through {@code adjustDocWords}. A failure while
 * applying is printed and otherwise ignored; the scaled difference is returned either way.
 *
 * @param doc     document whose word index is adjusted
 * @param oldText text before the change
 * @param newText text after the change
 * @param weight  factor applied to every count of the difference
 * @return the scaled difference table
 */
public static Hashtable updateTextIndex(SearchableEntity doc, String oldText, String newText, int weight){
    Hashtable weighted = DocUtil.mulWords(DocUtil.replaceContent(oldText, newText), weight);
    try {
        DocUtil.adjustDocWords(doc, "word", weighted);
    } catch (Exception failure) {
        failure.printStackTrace();
    }
    return weighted;
}
/**
 * Applies a table of per-word adjustments for one document to the index tables.
 *
 * <p>What the visible code shows: it obtains and connects the {@code Idc} for the
 * document's basis table and the given column, prepares batches of gets, puts and
 * deletes, calls {@code doc.setWordCount(wordCount)}, and finally sends the non-empty
 * batches to {@code Obase.getIndexTable()} and {@code Obase.getIdxTable()}.
 *
 * <p>NOTE(review): this copy of the method is incomplete. The two lines starting with
 * {@code for(int i=0; i} are not valid Java; the text that originally followed
 * (loop headers, the lookups, and steps 2 and 3) is missing and must be restored from
 * the real source before this method can be changed safely.
 *
 * @param doc    entity whose basis table selects the {@code Idc} and whose word count is set
 * @param column column name passed to {@code Idc.getInstance}
 * @param fix    table whose keys are the words to adjust
 *               (presumably word to signed count delta - confirm against callers)
 * @throws Exception declared by the method; the visible code does not catch anything
 */
public static void adjustDocWords(SearchableEntity doc, String column, Hashtable fix) throws Exception{
// Not reassigned anywhere in the visible text; presumably updated in the missing part.
int wordCount = 0;
int count = fix.size();
// Snapshot of the words (keys of fix).
List words = new Vector();
words.addAll(fix.keySet());
Idc idc= Idc.getInstance(doc.getBasisTable(), column);
idc.connect();
// NOTE(review): the block below is disabled code that was already commented out.
// Lunnar lunnar = new Lunnar(5);
// lunnar.submit(new Runnable(){
// @Override
// public void run() {
// Result[] results = BSL.getIdxTable().get(idxGets);
// int newIdxCount = 0;
// for(Result r : results){
// if(r.isEmpty())newIdxCount ++ ;
// }
// }
// });
// One WordInfo slot per adjusted word.
WordInfo[] wordArray = new WordInfo[count];
//1. Collect and organize
List idxGets = new Vector();
List indexGets = new Vector();
// NOTE(review): the next line is truncated - the rest of the loop header and everything up to
// the declaration of idxPuts is missing.
for(int i=0; i idxPuts = new Vector();
List indexPuts = new Vector();
List idxDeletes = new Vector();
List indexDeletes = new Vector();
// NOTE(review): the next line is truncated as well - the loop header, the definition of wi and
// the start of the condition ending in "0){" are missing.
for(int i=0; i 0){
// Stored reference already equals wi.ref: nothing to write for this word.
if(wi.idx!=null && wi.idx.getReference() == wi.ref){
}else{
// Otherwise queue a write of wi.ref into the reference column of the idx row.
Put put = new Put(wi.idxRow);
put.addColumn(Bytez.from(Obase.FAMILY_ATTR), Bytez.from(Obase.COLUMN_REFERENCE), Bytez.from(wi.ref));
idxPuts.add(put);
}
}
}
//4. Apply updates
doc.setWordCount(wordCount);
// Each batch is sent only when non-empty: index puts, index deletes, idx puts, idx deletes.
if(!indexPuts.isEmpty())
Obase.getIndexTable().put(indexPuts);
if(!indexDeletes.isEmpty())
Obase.getIndexTable().delete(indexDeletes);
if(!idxPuts.isEmpty())
Obase.getIdxTable().put(idxPuts);
if(!idxDeletes.isEmpty())
Obase.getIdxTable().delete(idxDeletes);
}
/**
 * Mutable per-word scratch record; {@code adjustDocWords} allocates one slot per
 * adjusted word.
 */
private static final class WordInfo{
    /** Row key used to build the {@code Put} for the idx table. */
    byte[] idxRow;
    /** Idx entry for the word; {@code null} is treated as "reference must be written". */
    Idx idx;
    /** Reference value compared with {@code idx.getReference()} and written to the reference column. */
    long ref;

    /** NOTE(review): presumably the row key of the word's entry in the index table - confirm. */
    byte[] indexRow;
    /** NOTE(review): presumably the word's count for the document - confirm. */
    int count;
}
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy