// edu.emory.mathcs.nlp.zzz.CaseCollect — Maven / Gradle / Ivy
// The newest version!
/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.zzz;
import edu.emory.mathcs.nlp.common.util.CharUtils;
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.MathUtils;
import edu.emory.mathcs.nlp.common.util.Splitter;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
/**
* @author Jinho D. Choi ({@code [email protected]})
*/
public class CaseCollect
{
/**
 * Builds a case-statistics table from whitespace-separated text.
 *
 * Every line of {@code in} is split with {@link Splitter#splitSpace}; each token is passed
 * through {@link StringUtils#toSimplifiedForm} and then lower-cased. The lower-cased form is
 * the map key, and the value is a two-element counter:
 * <ul>
 * <li>{@code [0]} - occurrences whose simplified form was already equal to its lower-cased form</li>
 * <li>{@code [1]} - all occurrences of the token, regardless of case</li>
 * </ul>
 *
 * @param in the text to read; it is wrapped by {@code IOUtils.createBufferedReader} and the
 *           resulting reader is closed before this method returns.
 * @return a mutable map from lower-cased token to its {@code {lowercaseCount, totalCount}} pair.
 * @throws Exception if reading from {@code in} fails.
 */
public Map<String,int[]> createMap(InputStream in) throws Exception
{
    Map<String,int[]> map = new HashMap<>();

    // try-with-resources: the reader is released even when readLine() or tokenisation throws.
    try (BufferedReader reader = IOUtils.createBufferedReader(in))
    {
        String line;

        while ((line = reader.readLine()) != null)
        {
            for (String s : Splitter.splitSpace(line))
            {
                s = StringUtils.toSimplifiedForm(s);
                String l = StringUtils.toLowerCase(s);
                int[] count = map.computeIfAbsent(l, k -> new int[]{0,0});

                // count[0] tracks only the occurrences that were already lower case.
                if (s.equals(l)) count[0]++;
                count[1]++;
            }
        }
    }

    return map;
}
/**
 * Selects the keys of a case-statistics table that are frequent and predominantly lower case.
 *
 * A key is kept when all of the following hold for its counter {@code {c0, c1}}:
 * <ul>
 * <li>{@code c1 > cutoff}</li>
 * <li>{@code MathUtils.divide(c0, c1) > threshold} (presumably the floating-point ratio
 *     {@code c0 / c1}; confirm against {@code MathUtils})</li>
 * <li>{@link #isVocab(String)} accepts the key</li>
 * </ul>
 *
 * @param map       table from token to a two-element counter, where index 0 is compared
 *                  against index 1 (as produced by {@code createMap}).
 * @param cutoff    exclusive lower bound on the counter's second element.
 * @param threshold exclusive lower bound on the ratio of the first to the second element.
 * @return a new mutable set holding the keys that passed every test.
 * @throws Exception declared for compatibility with existing callers.
 */
public Set<String> shrink(Map<String,int[]> map, int cutoff, double threshold) throws Exception
{
    Set<String> set = new HashSet<>();

    for (Entry<String,int[]> e : map.entrySet())
    {
        int[] count = e.getValue();

        // The cutoff test comes first so the ratio and vocabulary checks are
        // only evaluated for entries that are frequent enough.
        if (count[1] > cutoff && MathUtils.divide(count[0], count[1]) > threshold && isVocab(e.getKey()))
            set.add(e.getKey());
    }

    return set;
}
/**
 * Decides whether a token counts as a vocabulary entry.
 *
 * NOTE(review): this listing is damaged. The text between the loop condition of this method
 * and a later generic declaration was lost when the file was extracted, so the remainder of
 * this method and the beginning of the following method (apparently the program entry point)
 * are missing. Restore both from the upstream source before compiling.
 */
public boolean isVocab(String s)
{
char[] cs = s.toCharArray();
char c;
// Reject tokens that start with '#', '%', '@' or '(' and tokens for which
// CharUtils.containsDigitPunctuationOnly returns true over the whole array.
// NOTE(review): cs[0] is read without a length check, so an empty string would throw here.
if (cs[0] == '#' || cs[0] == '%' || cs[0] == '@' || cs[0] == '(' || CharUtils.containsDigitPunctuationOnly(cs, 0, cs.length))
return false;
// NOTE(review): the next line is two statements fused by the extraction damage: the head of
// a character loop from isVocab and the tail of a deserialization statement from a later method.
for (int i=0; i map = (Map)in.readObject();
// From here on the names in, collect, cutoff, threshold, out and OUTPUT_FILE are used but
// their declarations are in the lost region.
// Filter the deserialized table and report the size before and after.
Set set = collect.shrink(map, cutoff, threshold);
System.out.println(map.size()+" -> "+set.size());
// Serialize the filtered set, then close the object stream.
out.writeObject(set);
out.close();
// Also write the entries in sorted order, one per line, to OUTPUT_FILE + ".txt".
PrintStream fout = IOUtils.createBufferedPrintStream(OUTPUT_FILE+".txt");
List list = new ArrayList<>(set);
Collections.sort(list);
for (String s : list) fout.println(s);
fout.close();
}
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy