hivemall.ftvec.text.TermFrequencyUDAF Maven / Gradle / Ivy
The newest version!
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package hivemall.ftvec.text;
import hivemall.utils.lang.mutable.MutableInt;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDAF;
import org.apache.hadoop.hive.ql.exec.UDAFEvaluator;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
/**
 * Hive aggregate function {@code tf}: counts how often each term occurs in a
 * group and returns, for every distinct term, its count divided by the total
 * number of (non-null) terms seen.
 */
@SuppressWarnings("deprecation")
@Description(name = "tf",
        value = "_FUNC_(string text) - Return a term frequency in <string, float>")
public final class TermFrequencyUDAF extends UDAF {

    /**
     * Evaluator holding the running per-term counts and the overall term count.
     */
    public static class Evaluator implements UDAFEvaluator {

        /**
         * Intermediate aggregation state: occurrences per term plus the total
         * number of terms counted so far.
         */
        public static class PartialResult {
            /** Number of occurrences recorded for each distinct term. */
            private final Map<Text, MutableInt> map;
            /** Total number of terms counted (sum of all values in {@link #map}). */
            private long globalCount;

            public PartialResult() {
                this.map = new HashMap<Text, MutableInt>();
                this.globalCount = 0L;
            }
        }

        /** Current aggregation state; {@code null} until the first term or merge arrives. */
        private PartialResult partial;

        /** Resets the evaluator by discarding any accumulated state. */
        @Override
        public void init() {
            this.partial = null;
        }

        /**
         * Adds one occurrence of {@code term} to the running counts.
         *
         * @param term the term to count; {@code null} is ignored
         * @return always {@code true}
         */
        public boolean iterate(Text term) {
            if (term == null) {
                return true;
            }
            if (partial == null) {
                this.partial = new PartialResult();
            }
            final Map<Text, MutableInt> map = partial.map;
            final MutableInt count = map.get(term);
            if (count == null) {
                // Copy the key: the map must not retain the caller's Text instance.
                map.put(new Text(term), new MutableInt(1));
            } else {
                count.setValue(count.getValue() + 1);
            }
            partial.globalCount++;
            return true;
        }

        /**
         * @return the current intermediate state, or {@code null} if nothing
         *         has been aggregated yet
         */
        public PartialResult terminatePartial() {
            return partial;
        }

        /**
         * Folds another partial result into this evaluator's state.
         *
         * @param other the partial result to merge; {@code null} is ignored
         * @return always {@code true}
         */
        public boolean merge(PartialResult other) {
            if (other == null) {
                return true;
            }
            if (partial == null) {
                this.partial = new PartialResult();
            }
            final Map<Text, MutableInt> thisMap = partial.map;
            for (Map.Entry<Text, MutableInt> e : other.map.entrySet()) {
                final Text term = e.getKey();
                final MutableInt otherCount = e.getValue();
                final MutableInt thisCount = thisMap.get(term);
                if (thisCount == null) {
                    // Store copies rather than the other partial's own objects:
                    // the counter inserted here is mutated by later merges, which
                    // would otherwise silently change `other` as well.
                    thisMap.put(new Text(term), new MutableInt(otherCount.getValue()));
                } else {
                    thisCount.setValue(thisCount.getValue() + otherCount.getValue());
                }
            }
            partial.globalCount += other.globalCount;
            return true;
        }

        /**
         * Produces the final result and clears the evaluator's state.
         *
         * @return a map from each term to {@code count / totalCount}, or
         *         {@code null} if nothing was aggregated
         */
        public Map<Text, FloatWritable> terminate() {
            if (partial == null) {
                return null;
            }
            final long globalCount = partial.globalCount;
            final Map<Text, FloatWritable> tfmap = new HashMap<Text, FloatWritable>();
            for (Map.Entry<Text, MutableInt> e : partial.map.entrySet()) {
                final Text term = e.getKey();
                // Widen the count to float first so the division is not integer division.
                final float count = e.getValue().getValue();
                final float freq = count / globalCount;
                tfmap.put(term, new FloatWritable(freq));
            }
            this.partial = null;
            return tfmap;
        }
    }
}
© 2015 - 2025 Weber Informatics LLC | Privacy Policy