// eu.fbk.utils.lsa.io.TermDocumentMatrixBuilder Maven / Gradle / Ivy
/*
* Copyright (2010) Fondazione Bruno Kessler (FBK)
*
* FBK reserves all rights in the Program as delivered.
* The Program or any portion thereof may not be reproduced
* in any form whatsoever except as provided by license
* without the written consent of FBK. A license under FBK's
* rights in the Program may be available directly from FBK.
*/
package eu.fbk.utils.lsa.io;
import eu.fbk.utils.lsa.Index;
import eu.fbk.utils.lsa.TermSet;
import eu.fbk.utils.lsa.Vocabulary;
import org.apache.log4j.Logger;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;
import java.util.Map;
import java.util.regex.Pattern;
/**
* This class creates the term-by-document matrix, row index,
* col index and document frequency to be used by
* SVDLIBC
* to create a LSA model.
*
* The input is a file where each line is a tokenized document
* (tokens are separated by spaces); the first token is the document name.
*
* The output is stored in the specified directory matrix in
*
* sparse binary format
* The files created follow this naming convention:
*
*
* X-matrix: term-by-document matrix
* X-matrix-tf-idf: term-by-document matrix rescaled with tf-idf
* X-col: col index
* X-row: row index
* X-df: document frequency
*
*
* @version %I%, %G%
* @author Claudio Giuliano
* @since 1.0
*/
public abstract class TermDocumentMatrixBuilder {

    /**
     * Logger named after this class.
     */
    static Logger logger = Logger.getLogger(TermDocumentMatrixBuilder.class.getName());

    /**
     * The term index; terms are registered here as documents are added and
     * the index is written to the row file by {@link #close()}.
     */
    protected Index termIndex;

    /**
     * Pattern matching a single space character. It is not used by this
     * class itself; it is protected, so subclasses can use it.
     */
    protected static Pattern spacePattern = Pattern.compile(" ");

    /**
     * The document index; written to the col file by {@link #close()}.
     */
    protected Index documentIndex;

    /**
     * The matrix writer; receives one column per indexed document.
     */
    protected MatrixFileWriter matrixWriter;

    /**
     * Number of columns (documents) written to the matrix so far.
     */
    protected int columnCount;

    /**
     * Corpus-level vocabulary; every distinct term of every written document
     * is added once per document. Written to the df file by {@link #close()}.
     */
    protected Vocabulary corpusVocabulary;

    /**
     * Term filters loaded by the constructor. When the keyword set is
     * non-empty only its terms are indexed; otherwise every word that is not
     * in the stopword set is indexed.
     */
    protected TermSet stopwordSet, keywordSet;

    /**
     * Running sum of <code>array.length - 1</code> over all arrays passed to
     * {@link #addDocument(String[])}.
     */
    protected int totalKW;

    /**
     * Histogram of document lengths. For arrays of length 1..100 the slot
     * <code>array.length - 1</code> is incremented; longer arrays are counted
     * in slot 0 (which therefore also counts name-only documents).
     */
    protected int[] lengthFreq;

    /**
     * Output files, derived from the matrix name given to the constructor
     * with the suffixes <code>-matrix</code>, <code>-row</code>,
     * <code>-col</code> and <code>-df</code>.
     */
    protected File matrixFile, rowFile, colFile, dfFile;

    /**
     * Constructs a builder: loads the keyword and stopword sets, derives the
     * output file names from <code>matrixName</code> and opens the matrix
     * file for writing.
     *
     * @param matrixName   prefix of the output files
     * @param stopwordFile file the stopword set is read from
     * @param keywordFile  file the keyword set is read from
     * @throws IOException if a term file cannot be read or the matrix file
     *                     cannot be opened
     */
    public TermDocumentMatrixBuilder(String matrixName, File stopwordFile, File keywordFile) throws IOException {
        totalKW = 0;

        keywordSet = loadTermSet(keywordFile);
        logger.info("keyword to be indexed: " + keywordSet.size());

        stopwordSet = loadTermSet(stopwordFile);
        logger.info(stopwordFile + "(" + stopwordSet.size() + ")");

        lengthFreq = new int[101];
        columnCount = 0;

        matrixFile = new File(matrixName + "-matrix");
        rowFile = new File(matrixName + "-row");
        colFile = new File(matrixName + "-col");
        dfFile = new File(matrixName + "-df");

        termIndex = new Index();
        documentIndex = new Index();
        matrixWriter = new SparseBinaryMatrixFileWriter(matrixFile);
        corpusVocabulary = new Vocabulary();
    } // end constructor

    /**
     * Reads a term set from a file and always releases the reader.
     */
    private static TermSet loadTermSet(File file) throws IOException {
        TermSet set = new TermSet();
        FileReader reader = new FileReader(file);
        try {
            set.read(reader);
        } finally {
            // Closing a reader that was already closed has no effect, so this
            // is safe even if read() closes it.
            reader.close();
        }
        return set;
    }

    /**
     * Writes the row index, the col index and the document frequencies to
     * their files and closes the matrix writer. Every writer opened here is
     * closed before returning, and the matrix writer is closed even if
     * writing one of the indexes fails.
     *
     * @throws IOException if any of the files cannot be written or closed
     */
    public void close() throws IOException {
        try {
            FileWriter rowWriter = new FileWriter(rowFile);
            try {
                termIndex.write(rowWriter);
            } finally {
                rowWriter.close();
            }

            FileWriter colWriter = new FileWriter(colFile);
            try {
                documentIndex.write(colWriter);
            } finally {
                colWriter.close();
            }
        } finally {
            matrixWriter.close();
        }

        FileWriter dfWriter = new FileWriter(dfFile);
        try {
            corpusVocabulary.write(dfWriter);
        } finally {
            dfWriter.close();
        }
    } // end close

    /**
     * Reads the input rooted at <code>root</code>; implemented by subclasses.
     *
     * @param root the input file or directory
     * @throws IOException if the input cannot be read
     */
    public abstract void read(File root) throws IOException;

    /**
     * Adds one document to the matrix as a new column.
     *
     * <code>array[0]</code> is the document name and the remaining elements
     * are its tokens. Tokens are lower-cased (using the default locale),
     * filtered through {@link #isWord(String)} and the keyword/stopword
     * sets, and counted. If no token survives, nothing is written. Otherwise
     * each distinct term is registered in the term index and the corpus
     * vocabulary, and a column with the weights <code>1 + ln(tf)</code> is
     * written.
     *
     * @param array document name followed by the document tokens; a
     *              <code>null</code> or empty array is skipped
     * @throws IOException if the column cannot be written
     */
    protected void addDocument(String[] array) throws IOException {
        // Without a document name there is nothing to index; previously an
        // empty array failed below with lengthFreq[-1].
        if (array == null || array.length == 0) {
            logger.warn("skipping empty document");
            return;
        }

        Vocabulary documentVocabulary = new Vocabulary();
        totalKW += (array.length - 1);

        // Build the debug message only when debug logging is on.
        boolean debug = logger.isDebugEnabled();
        for (int i = 1; i < array.length; i++) {
            String token = array[i].toLowerCase();
            int rule = selectionRule(token);
            if (rule != 0) {
                if (debug) {
                    logger.debug(rule + " adding " + token);
                }
                documentVocabulary.add(token);
            }
        } // end for i

        // Slot 100 is never reached and arrays longer than 100 share slot 0;
        // kept as is because subclasses may read this protected histogram.
        if (array.length <= 100) {
            lengthFreq[array.length - 1]++;
        } else {
            lengthFreq[0]++;
        }

        if (documentVocabulary.size() == 0) {
            return;
        }

        // NOTE(review): the returned id is not used. If Index.get() does not
        // register unseen names, the col index stays empty -- confirm whether
        // add() was intended here.
        int documentID = documentIndex.get(array[0]);

        int size = documentVocabulary.entrySet().size();
        int[] indexes = new int[size];
        float[] values = new float[size];

        // One matrix entry per distinct term of the document.
        int j = 0;
        Iterator<?> it = documentVocabulary.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<?, ?> entry = (Map.Entry<?, ?>) it.next();
            String term = (String) entry.getKey();
            Vocabulary.TermFrequency tf = (Vocabulary.TermFrequency) entry.getValue();

            indexes[j] = termIndex.add(term);
            // Math.log is the natural logarithm; tf.get() is presumably the
            // count of the term in this document.
            values[j] = (float) (1 + Math.log(tf.get()));
            corpusVocabulary.add(term);
            j++;
        } // end while

        columnCount++;
        matrixWriter.writeColumn(indexes, values);
    } // end addDocument

    /**
     * Decides whether a token is indexed and which rule selected it.
     *
     * @return 0 if the token is skipped; 1 if it is a word and both sets are
     *         empty; 2 if it is a word, the keyword set is empty and the
     *         token is not a stopword; 3 if it is a word contained in the
     *         non-empty keyword set
     */
    private int selectionRule(String token) {
        if (!isWord(token)) {
            return 0;
        }
        if (keywordSet.size() == 0) {
            if (stopwordSet.size() == 0) {
                return 1;
            }
            return stopwordSet.contains(token) ? 0 : 2;
        }
        return keywordSet.contains(token) ? 3 : 0;
    }

    /**
     * Tells whether a token counts as a word: at least two characters, the
     * first one a letter and all the others letters or digits.
     */
    private boolean isWord(String s) {
        if (s.length() < 2) {
            return false;
        }
        if (!Character.isLetter(s.charAt(0))) {
            return false;
        }
        for (int i = 1; i < s.length(); i++) {
            // NOTE(review): the original test was
            // "!isLetterOrDigit(ch) || ch == '-'"; the second clause could
            // never change the result because '-' is not a letter or digit.
            // Hyphenated tokens are therefore rejected; confirm that this is
            // the intent (allowing them would need "&& ch != '-'").
            if (!Character.isLetterOrDigit(s.charAt(i))) {
                return false;
            }
        }
        return true;
    } // end isWord
} // end TermDocumentMatrixBuilder
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy