// edu.emory.mathcs.nlp.zzz.Radiology Maven / Gradle / Ivy
// The newest version!
/**
* Copyright 2015, Emory University
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package edu.emory.mathcs.nlp.zzz;
import edu.emory.mathcs.nlp.common.util.CharUtils;
import edu.emory.mathcs.nlp.common.util.IOUtils;
import edu.emory.mathcs.nlp.common.util.MathUtils;
import edu.emory.mathcs.nlp.common.util.Splitter;
import edu.emory.mathcs.nlp.common.util.StringUtils;
import edu.emory.mathcs.nlp.learning.optimization.OnlineOptimizer;
import edu.emory.mathcs.nlp.learning.optimization.method.AdaGrad;
import edu.emory.mathcs.nlp.learning.util.FeatureMap;
import edu.emory.mathcs.nlp.learning.util.Instance;
import edu.emory.mathcs.nlp.learning.util.MLUtils;
import edu.emory.mathcs.nlp.learning.util.SparseVector;
import edu.emory.mathcs.nlp.learning.util.WeightVector;
import java.io.BufferedReader;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
/**
 * Ad-hoc experiment that trains a bag-of-words classifier on a tab-separated
 * radiology report file and reports development-set accuracy.
 * <p>
 * For every run the TSV file is re-read, column 1 of each row is converted into a
 * set of normalized tokens that are not stop words, and the column selected by
 * {@code COL} is used as the label. The instances are shuffled, split into
 * {@value #TRN_SIZE} training and {@value #DEV_SIZE} development instances, and an
 * {@link AdaGrad} optimizer is trained for {@value #EPOCHS} epochs. The best
 * development score of each run and the average over {@value #RUNS} runs are
 * printed to standard output.
 *
 * @author Jinho D. Choi ({@code jinho.choi@emory.edu})
 */
public class Radiology
{
    /** Number of independent shuffle/train/evaluate runs. */
    private static final int RUNS = 10;
    /** Number of passes over the training split within one run. */
    private static final int EPOCHS = 20;
    /** Number of shuffled instances used for training. */
    private static final int TRN_SIZE = 400;
    /** Number of shuffled instances (following the training split) used for evaluation. */
    private static final int DEV_SIZE = 100;

    /** Feature map used by {@link #toSparseVector(Set)}; replaced at the start of every run. */
    FeatureMap feature_map;

    /**
     * Runs the whole experiment and prints the results to standard output.
     *
     * @param args command-line arguments; currently ignored (all paths are hard-coded below).
     * @throws IllegalStateException if the TSV file yields fewer than
     *         {@code TRN_SIZE + DEV_SIZE} instances.
     * @throws Exception if the stop-word file or the TSV file cannot be read.
     */
    public Radiology(String[] args) throws Exception
    {
        final String STOP_WORDS = "/Users/jdchoi/Documents/EmoryNLP/english-models/src/main/resources/edu/emory/mathcs/nlp/english/lexica/en-stop-words-simplified-uncapitalized.xz";
        final String TSV_FILE = "/Users/jdchoi/Emory/radiology/dat/radiology_report_151112_falgun.tsv";
        final int COL = 6;

        Set<String> stopwords = getStopWordSet(STOP_WORDS);
        double avg = 0;

        for (int i=0; i<RUNS; i++)
        {
            // The feature map must be fresh before the instances are built,
            // because toSparseVector() registers features in it.
            feature_map = new FeatureMap();
            List<Instance> instances = getInstances(TSV_FILE, COL, stopwords);

            // Fail with a readable message instead of an IndexOutOfBoundsException from subList().
            if (instances.size() < TRN_SIZE + DEV_SIZE)
                throw new IllegalStateException("Expected at least "+(TRN_SIZE + DEV_SIZE)+" instances in "+TSV_FILE+" but found "+instances.size());

            WeightVector w = new WeightVector();
            // NOTE(review): 0.001f and 0f are presumably the learning rate and bias - confirm against AdaGrad.
            OnlineOptimizer optimizer = new AdaGrad(w, 0.001f, 0f);

            Collections.shuffle(instances);
            List<Instance> trn = instances.subList(0, TRN_SIZE);
            List<Instance> dev = instances.subList(TRN_SIZE, TRN_SIZE + DEV_SIZE);
            double bestScore = 0;

            for (int j=0; j<EPOCHS; j++)
            {
                for (Instance instance : trn)
                    optimizer.train(instance);

                int correct = 0;

                for (Instance instance : dev)
                {
                    float[] scores = optimizer.scores(instance.getFeatureVector());
                    String y = optimizer.getLabel(MLUtils.argmax(scores));
                    if (instance.isStringLabel(y)) correct++;
                }

                // Keep the best development score seen across the epochs of this run.
                double score = MathUtils.accuracy(correct, dev.size());
                if (score > bestScore) bestScore = score;
            }

            System.out.printf("%5.2f\n", bestScore);
            avg += bestScore;
        }

        System.out.printf("Avg: %5.2f\n", avg/RUNS);
    }

    /**
     * Reads a serialized set of stop words.
     * <p>
     * The file is read with Java object deserialization, so it must come from a
     * trusted location.
     *
     * @param filename path to the serialized stop-word set.
     * @return the deserialized set.
     * @throws Exception if the file cannot be opened or deserialized.
     */
    @SuppressWarnings("unchecked")
    Set<String> getStopWordSet(String filename) throws Exception
    {
        // try-with-resources closes the stream even if readObject() fails.
        try (ObjectInputStream in = IOUtils.createObjectXZBufferedInputStream(filename))
        {
            return (Set<String>)in.readObject();
        }
    }

    /**
     * Builds one training instance per line of a tab-separated file.
     * <p>
     * Column 1 supplies the text that is turned into features; column {@code index}
     * supplies the label. Features are registered in {@link #feature_map}, which must
     * therefore be initialized before this method is called.
     *
     * @param filename  path to the tab-separated file.
     * @param index     zero-based column holding the label.
     * @param stopwords tokens to exclude from the features.
     * @return the instances in file order.
     * @throws Exception if the file cannot be read.
     */
    List<Instance> getInstances(String filename, int index, Set<String> stopwords) throws Exception
    {
        List<Instance> instances = new ArrayList<>();

        // try-with-resources closes the reader even if a line fails to parse.
        try (BufferedReader reader = IOUtils.createBufferedReader(filename))
        {
            String line;

            while ((line = reader.readLine()) != null)
            {
                String[] t = Splitter.splitTabs(line);
                Set<String> set = toSet(t[1], stopwords);
                instances.add(new Instance(t[index], toSparseVector(set)));
            }
        }

        return instances;
    }

    /**
     * Converts a set of tokens into a sorted sparse feature vector, registering every
     * token in {@link #feature_map} under feature type 0.
     *
     * @param set the tokens to convert.
     * @return the sparse vector after {@code sort()} has been applied.
     */
    SparseVector toSparseVector(Set<String> set)
    {
        SparseVector vector = new SparseVector();

        for (String s : set)
            vector.add(feature_map.add(0, s));

        vector.sort();
        return vector;
    }

    /**
     * Splits a string on spaces and collects the distinct normalized tokens that are
     * neither empty nor contained in the stop-word set.
     *
     * @param s         the raw text.
     * @param stopwords tokens to exclude.
     * @return the set of remaining normalized tokens.
     */
    Set<String> toSet(String s, Set<String> stopwords)
    {
        Set<String> set = new HashSet<>();

        for (String t : Splitter.splitSpace(s))
        {
            t = toWord(t);

            if (!t.isEmpty() && !stopwords.contains(t))
                set.add(t);
        }

        return set;
    }

    /**
     * Normalizes a token: drops every character that {@link CharUtils#isPunctuation}
     * accepts and passes the remainder through {@link StringUtils#toSimplifiedForm}.
     *
     * @param t the raw token.
     * @return the normalized token; may be empty if the token was all punctuation.
     */
    String toWord(String t)
    {
        StringBuilder build = new StringBuilder();

        for (char c : t.toCharArray())
        {
            if (!CharUtils.isPunctuation(c))
                build.append(c);
        }

        // NOTE(review): the 'true' flag presumably requests uncapitalization - confirm against StringUtils.
        return StringUtils.toSimplifiedForm(build.toString(), true);
    }

    /**
     * Command-line entry point; runs the experiment once.
     *
     * @param args forwarded to {@link #Radiology(String[])}.
     * @throws Exception if the experiment fails.
     */
    static public void main(String[] args) throws Exception
    {
        new Radiology(args);
    }
}
// © 2015 - 2024 Weber Informatics LLC | Privacy Policy