// hex.word2vec.WordCountTask (Maven / Gradle / Ivy)
package hex.word2vec;
import water.AutoBuffer;
import water.MRTask;
import water.fvec.Chunk;
import water.parser.BufferedString;
import water.util.IcedHashMap;
import water.util.IcedLong;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
/**
 * Reduces a string column to the set of unique words it contains, together
 * with the number of times each word occurs.
 *
 * <p>{@link #map(Chunk)} counts the words of a single chunk and
 * {@link #reduce(WordCountTask)} folds two partial results together.
 * Currently the resulting map is consolidated on the calling node. Given
 * the limited vocabulary size of most languages, the resulting
 * map is presumed to easily fit in memory.
 *
 * <p>Custom serialization ({@link #write_impl(AutoBuffer)} /
 * {@link #read_impl(AutoBuffer)}) writes one record per word: a 2-byte
 * length, the word's bytes and an 8-byte count. The sequence is terminated
 * by the 2-byte marker value 65535.
 */
public class WordCountTask extends MRTask<WordCountTask> {

  /** 2-byte length value that terminates the serialized map. */
  private static final int END_OF_MAP = 65535;

  // OUT
  /** Word to occurrence count; created by {@link #map(Chunk)} or {@link #read_impl(AutoBuffer)}. */
  IcedHashMap<BufferedString, IcedLong> _counts;

  WordCountTask() {}

  /**
   * Counts the occurrences of every non-NA string in the given chunk.
   *
   * @param cs chunk of the string column being counted
   */
  @Override
  public void map(Chunk cs) {
    _counts = new IcedHashMap<>();
    for (int i = 0; i < cs._len; i++) {
      if (cs.isNA(i)) continue;
      // A fresh BufferedString per row because it may be retained as a map key.
      BufferedString str = cs.atStr(new BufferedString(), i);
      IcedLong count = _counts.get(str);
      if (count != null) {
        count._val++;
      } else {
        _counts.put(str, new IcedLong(1));
      }
    }
  }

  /**
   * Merges the counts collected by another task into this one.
   *
   * @param other task whose counts are added to this task's counts
   */
  @Override
  public void reduce(WordCountTask other) {
    assert _counts != null;
    assert other._counts != null;
    for (BufferedString str : other._counts.keySet()) {
      IcedLong otherCount = other._counts.get(str);
      IcedLong myCount = _counts.get(str);
      if (myCount == null) {
        // Word not seen on this side yet: adopt the other side's counter.
        _counts.put(str, otherCount);
      } else {
        myCount._val += otherCount._val;
      }
    }
  }

  /**
   * Serializes the word counts as (length, bytes, count) records followed by
   * the end-of-map marker. A {@code null} map is written as an empty map.
   *
   * @param ab buffer to write into
   * @return the buffer returned by the final write
   */
  public final AutoBuffer write_impl(AutoBuffer ab) {
    if (_counts != null) {
      for (BufferedString key : _counts.keySet()) {
        int len = key.length();
        // The length is stored in 2 bytes and 65535 is reserved as the
        // terminator, so a longer word would corrupt the stream.
        assert len < END_OF_MAP : "Word too long to serialize: " + len + " bytes";
        ab.put2((char) len)
          .putA1(key.getBuffer(), key.getOffset(), key.getOffset() + len)
          .put8(_counts.get(key)._val);
      }
    }
    return ab.put2((char) END_OF_MAP); // End of map marker
  }

  /**
   * Rebuilds the word counts from records written by
   * {@link #write_impl(AutoBuffer)}.
   *
   * @param ab buffer to read from
   * @return this task, with {@code _counts} replaced by the deserialized map
   */
  public final WordCountTask read_impl(AutoBuffer ab) {
    _counts = new IcedHashMap<>();
    int len;
    while ((len = ab.get2()) != END_OF_MAP) { // Read until end-of-map marker
      byte[] bs = ab.getA1(len);
      long cnt = ab.get8();
      // Decode with an explicit charset so the result does not depend on the
      // JVM's platform default encoding.
      // NOTE(review): assumes the word bytes are UTF-8 and that
      // BufferedString(String) re-encodes as UTF-8 - confirm.
      String word = new String(bs, StandardCharsets.UTF_8);
      _counts.put(new BufferedString(word), new IcedLong(cnt));
    }
    return this;
  }
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy