// org.fbk.cit.hlt.thewikimachine.util.FreqSet (source listing header: Maven / Gradle / Ivy)
package org.fbk.cit.hlt.thewikimachine.util;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import java.io.*;
import java.util.*;
/**
* A set of strings in which each string is associated with
* the frequency countPageCounter
*
* @author Claudio Giuliano
* @version %I%, %G%
* @since 1.0
*/
public class FreqSet {
/**
* Define a static logger variable so that it references the
* Logger instance named FreqSet
.
*/
static Logger logger = Logger.getLogger(FreqSet.class.getName());
protected Map map;
protected int total;
public FreqSet() {
map = new TreeMap();
}
/*public FreqSet(boolean threadSafe) {
if (threadSafe) {
map = Collections.synchronizedMap(new TreeMap());
}
else {
map = new TreeMap();
}
}*/
public int total() {
return total;
}
public void addAll(Set set) {
Iterator it = set.iterator();
while (it.hasNext()) {
add(it.next());
}
}
public int add(String ngram, int count) {
//todo: this is not thread safe
total += count;
Counter c = (Counter) map.get(ngram);
if (c == null) {
map.put(ngram, new Counter(count));
return count;
}
c.inc(count);
return c.count;
}
public int add(String ngram) {
//todo: this is not thread safe
total++;
Counter c = (Counter) map.get(ngram);
if (c == null) {
map.put(ngram, new Counter(1));
return 1;
}
c.inc();
return c.count;
}
public boolean contains(String ngram) {
Counter c = (Counter) map.get(ngram);
if (c == null) {
return false;
}
return true;
} // end contains
//
public Collection values() {
return map.values();
} // end values
public Object[] toArray() {
return map.keySet().toArray();
} // end toArray
//
public Iterator iterator() {
return map.keySet().iterator();
} // end iterator
//
public int get(String ngram) {
//logger.debug("get: " + ngram + ", " + toChar(ngram));
Counter c = (Counter) map.get(ngram);
if (c == null) {
return 0;
}
return c.get();
} // end get
//
public static String toChar(String w) {
StringBuilder sb = new StringBuilder();
int ch = 0;
for (int i = 0; i < w.length(); i++) {
ch = w.charAt(i);
if (i > 0) {
sb.append(" ");
}
sb.append(ch);
}
sb.append("\n");
for (int i = 0; i < w.length(); i++) {
ch = w.charAt(i);
if (i > 0) {
sb.append(" ");
}
sb.append((char) ch);
}
return sb.toString();
} // end toChar
//
public int size() {
return map.size();
} // end size
public void write(Writer out) throws IOException {
Iterator it = map.entrySet().iterator();
//logger.info("writing freq set " + map.entrySet().size());
//logger.info("writing freq set " + map.size());
int i = 0;
while (it.hasNext()) {
Map.Entry entry = (Map.Entry) it.next();
// freq key
out.write(entry.getValue().toString());
out.write("\t");
out.write(entry.getKey().toString());
out.write("\n");
if ((i % 100000) == 0) {
out.flush();
}
} // end while
} // end write
public void write(Writer writer, boolean sort) throws IOException {
SortedMap> sortedMap = toSortedMap();
Iterator it = sortedMap.keySet().iterator();
for (int i = 0; it.hasNext(); i++) {
Integer freq = it.next();
List list = sortedMap.get(freq);
for (int j = 0; j < list.size(); j++) {
writer.write(freq.toString());
writer.write(StringTable.HORIZONTAL_TABULATION);
writer.write(list.get(j));
writer.write(StringTable.LINE_FEED);
}
}
writer.flush();
}
/**
* Reads the frequency set from the specified input stream.
*
* This method processes input in terms of lines. A natural
* line of input is terminated either by a set of line
* terminator characters (\n or \r or \r\n) or by the end
* of the filePageCounter. A natural line may be either a blank line,
* a comment line, or hold some part of a id-feature pair.
* Lines are read from the input stream until end of filePageCounter
* is reached.
*
* A natural line that contains only white space characters
* is considered blank and is ignored. A comment line has
* an ASCII '#' as its first non-white space character;
* comment lines are also ignored and do not encode id-feature
* information.
*
* The id contains all of the characters in the line starting
* with the first non-white space character and up to, but
* not including, the first '\t'. All remaining characters
* on the line become part of the associated feature string;
* if there are no remaining characters, the feature is the
* empty string "".
*
* @param in a Reader
object to
* provide the underlying stream.
* @throws IOException if reading this feature termIndex
* from the specified input stream
* throws an IOException
.
*/
public void read(Reader in) throws IOException {
logger.info("reading vocabulary...");
LineNumberReader lnr = new LineNumberReader(in);
String line;
String[] s;
Integer id;
int count = 0;
while ((line = lnr.readLine()) != null) {
line = line.trim();
//logger.debug(line);
if (!line.startsWith("#")) {
s = line.split("\t");
// token index
//logger.debug(line);
if (s.length == 2) {
/*
SynchronizedCounter c = map.get(s[0]);
if (c == null)
{
map.put(s[0], new SynchronizedCounter(Integer.parseLong(s[1])));
}
else
{
c.inc(Integer.parseLong(s[1]));
}
*/
int freq = Integer.parseInt(s[0]);
total += freq;
//if (freq > 5)
{
//logger.debug("added: " + toChar(s[0]));
map.put(s[1], new Counter(freq));
}
} // end if
} // end if
}
lnr.close();
logger.info(map.size() + " n-grams read");
} // end read
//
public String getMaxValue() {
int max = 0;
String maxs = null;
int f = 0;
String s = null;
Counter c = null;
Iterator it = map.keySet().iterator();
while (it.hasNext()) {
s = it.next();
c = map.get(s);
f = c.get();
if (f > max) {
max = f;
maxs = s;
} // end if
} // end
return maxs;
} // end getMaxValue
//
public SortedMap> toSortedMap() {
SortedMap> smap = new TreeMap>(new Comparator() {
public int compare(Integer e1, Integer e2) {
return e2.compareTo(e1);
}
});
Iterator it = map.keySet().iterator();
while (it.hasNext()) {
String s = it.next();
Counter c = map.get(s);
List list = smap.get(c.get());
if (list == null) {
list = new ArrayList();
list.add(s);
smap.put(c.get(), list);
}
else {
list.add(s);
}
}
return smap;
} // end toSortedMap
//
public String toString(boolean b) {
StringBuilder sb = new StringBuilder();
String s = null;
Counter c = null;
Iterator it = map.keySet().iterator();
for (int i = 0; it.hasNext(); i++) {
if (i > 0) {
sb.append("\t");
}
s = it.next();
c = map.get(s);
sb.append(s);
sb.append("\t");
sb.append(c);
}
return sb.toString();
} // end toString
//
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("(");
Iterator it = map.keySet().iterator();
while (it.hasNext()) {
String s = it.next();
Counter c = map.get(s);
sb.append(s);
sb.append("\t");
sb.append(c);
sb.append(", ");
}
sb.append("...)");
return sb.toString();
} // end toString
class Counter {
int count;
public Counter(int count) {
this.count = count;
}
public void inc() {
count++;
}
public void inc(int l) {
count += l;
}
public int get() {
return count;
}
public String toString() {
return Integer.toString(count);
}
}
public static void main(String args[]) throws Exception {
String logConfig = System.getProperty("log-config");
if (logConfig == null) {
logConfig = "log-config.txt";
}
PropertyConfigurator.configure(logConfig);
if (args.length != 2) {
//logger.info("java -mx1024M org.fbk.irst.tcc.web1t.FreqSet in-ngram-filePageCounter out-ngram-filePageCounter");
logger.info("java -mx1024M org.fbk.irst.tcc.web1t.FreqSet in-ngram-filePageCounter term");
System.exit(-1);
}
FreqSet set = new FreqSet();
//set.read(new FileReader(new File(args[0])));
InputStreamReader reader = new InputStreamReader(new FileInputStream(new File(args[0])), "UTF-8");
set.read(reader);
//logger.info(args[1] + ": " + set.get(args[1]));
//logger.info(args[1] + ": " + set.get("pluralita'"));
//set.write(new FileWriter(new File(args[1])));
OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(args[1])), "ISO-8859-1");
//set.write(new FileWriter(new File(args[1])));
set.write(writer);
}
} // end class FreqSet
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy