// fr.univnantes.termsuite.model.FileSystemCorpus Maven / Gradle / Ivy
package fr.univnantes.termsuite.model;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Stream;
import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;
import fr.univnantes.termsuite.api.TermSuiteException;
import fr.univnantes.termsuite.uima.readers.StringPreparator;
/**
 * A corpus backed by the file system: its documents are the regular files
 * found by walking a root directory whose paths match a glob pattern.
 *
 * <p>File contents are decoded with a configurable charset, UTF-8 by default.
 * The {@code extension} property is only stored and exposed here; nothing in
 * this class reads it.
 */
public abstract class FileSystemCorpus {
    private Lang lang;
    private Path rootDirectory;
    private Charset encoding = Charsets.UTF_8;
    private String pattern;
    private String extension;

    /**
     * Creates a corpus description. No file system access happens here; the
     * root directory is only validated when the corpus is walked.
     *
     * @param lang          the language assigned to every document of the corpus
     * @param rootDirectory the directory that is walked to find documents
     * @param pattern       the glob pattern (without the {@code glob:} prefix)
     *                      that document paths must match
     * @param extension     the file extension associated with this corpus
     */
    public FileSystemCorpus(Lang lang, Path rootDirectory, String pattern, String extension) {
        super();
        this.rootDirectory = rootDirectory;
        this.lang = lang;
        this.pattern = pattern;
        this.extension = extension;
    }

    public Lang getLang() {
        return lang;
    }

    public void setLang(Lang lang) {
        this.lang = lang;
    }

    public Path getRootDirectory() {
        return rootDirectory;
    }

    public void setRootDirectory(Path rootDirectory) {
        this.rootDirectory = rootDirectory;
    }

    public Charset getEncoding() {
        return encoding;
    }

    public void setEncoding(Charset encoding) {
        this.encoding = encoding;
    }

    public String getPattern() {
        return pattern;
    }

    public void setPattern(String pattern) {
        this.pattern = pattern;
    }

    public String getExtension() {
        return this.extension;
    }

    public void setExtension(String extension) {
        this.extension = extension;
    }

    /**
     * Verifies that the given path exists and is a directory.
     *
     * @param directoryPath the path to validate
     * @return {@code directoryPath}, unchanged
     * @throws IllegalArgumentException if the path does not exist or is not a directory
     */
    private Path check(Path directoryPath) {
        Preconditions.checkArgument(Files.exists(directoryPath),
                "Directory %s does not exist", directoryPath);
        Preconditions.checkArgument(Files.isDirectory(directoryPath),
                "Not a directory: %s", directoryPath);
        return directoryPath;
    }

    /**
     * Walks {@code directory} recursively and maps every regular file whose
     * path matches {@code pattern} with {@code pathMapper}.
     *
     * <p>The returned stream is backed by {@link Files#walk} and therefore
     * holds open directory handles: the caller must close it, preferably with
     * try-with-resources.
     *
     * @param <T>        the type of the mapped elements
     * @param directory  the directory to walk; must exist and be a directory
     * @param pattern    a glob pattern, without the {@code glob:} prefix, matched
     *                   against each walked path as returned by the walk
     * @param pathMapper converts each matching file path to a stream element
     * @return a lazily populated stream of mapped elements
     * @throws IllegalArgumentException if {@code directory} is missing or not a directory
     * @throws TermSuiteException       if the walk cannot be started
     */
    protected <T> Stream<T> pathWalker(Path directory, String pattern,
            Function<Path, T> pathMapper) {
        Path directoryPath = check(directory);
        final PathMatcher pathMatcher = FileSystems.getDefault()
                .getPathMatcher("glob:" + pattern);
        try {
            return Files.walk(directoryPath)
                    .filter(path -> pathMatcher.matches(path) && Files.isRegularFile(path))
                    .map(pathMapper);
        } catch (IOException e) {
            throw new TermSuiteException(e);
        }
    }

    /**
     * Reads the whole file designated by the document's URL and decodes it
     * with this corpus' encoding.
     *
     * @param doc the document whose URL is interpreted as a file system path
     * @return the decoded file content
     * @throws TermSuiteException if the file cannot be read
     */
    public String readFileContent(Document doc) {
        try {
            return new String(Files.readAllBytes(Paths.get(doc.getUrl())), encoding);
        } catch (IOException e) {
            throw new TermSuiteException(
                    "Could not read file content for document " + doc.getUrl(),
                    e);
        }
    }

    @Override
    public String toString() {
        return String.format("%s[%s]", this.getClass().getSimpleName(), this.rootDirectory);
    }

    /**
     * Counts the documents of this corpus by walking the file system.
     *
     * @return the number of documents, narrowed to {@code int}
     */
    public int getNbDocuments() {
        // The stream is backed by Files.walk: close it to release directory handles.
        try (Stream<Document> docs = documents()) {
            return (int) docs.count();
        }
    }

    /**
     * Sums the on-disk length, in bytes, of every document of this corpus.
     *
     * @return the total size of all documents
     */
    public long getTotalSize() {
        // The stream is backed by Files.walk: close it to release directory handles.
        try (Stream<Document> docs = documents()) {
            return docs
                    .mapToLong(doc -> Paths.get(doc.getUrl()).toFile().length())
                    .sum();
        }
    }

    /**
     * Streams one {@link Document} per file under the root directory that
     * matches the corpus pattern. Each document carries the corpus language,
     * the file path as its URL and the file length as its size.
     *
     * <p>The returned stream holds open file system resources and must be
     * closed by the caller, preferably with try-with-resources.
     *
     * @return a lazily populated stream of documents
     */
    public Stream<Document> documents() {
        return pathWalker(
                getRootDirectory(),
                getPattern(),
                path -> {
                    Document document = new Document(getLang(), path.toString());
                    document.setSize(path.toFile().length());
                    return document;
                });
    }

    /**
     * Reads the document's file content and passes it through
     * {@link #cleanRawText(String)}.
     *
     * @param doc the document to read
     * @return the prepared document text
     * @throws TermSuiteException if the file cannot be read
     */
    public String readDocumentText(Document doc) {
        return cleanRawText(readFileContent(doc));
    }

    /**
     * Prepares raw text by delegating to a fresh {@link StringPreparator}.
     *
     * @param rawText the text as read from the file
     * @return the result of {@code StringPreparator.prepare(rawText)}
     */
    public String cleanRawText(String rawText) {
        StringPreparator stringPreparator = new StringPreparator();
        return stringPreparator.prepare(rawText);
    }
}