All downloads are FREE. Search and download functionality uses the official Maven repository.

fr.univnantes.termsuite.model.FileSystemCorpus Maven / Gradle / Ivy

package fr.univnantes.termsuite.model;

import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.file.FileSystems;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.PathMatcher;
import java.nio.file.Paths;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.function.Function;
import java.util.stream.Stream;

import com.google.common.base.Charsets;
import com.google.common.base.Preconditions;

import fr.univnantes.termsuite.api.TermSuiteException;
import fr.univnantes.termsuite.uima.readers.StringPreparator;

/**
 * A corpus whose documents are files found under a root directory of the
 * file system.
 *
 * <p>Documents are discovered by walking {@link #getRootDirectory()} and keeping
 * every regular file whose path matches the glob {@link #getPattern()}. File
 * contents are decoded with {@link #getEncoding()}, which defaults to UTF-8.
 */
public abstract class FileSystemCorpus {

	private Lang lang;
	private Path rootDirectory;
	private Charset encoding = Charsets.UTF_8;
	private String pattern;
	private String extension;

	/**
	 * Creates a corpus description. No file system access happens here; the
	 * root directory is only validated when documents are listed.
	 *
	 * @param lang          the language assigned to every document of this corpus
	 * @param rootDirectory the directory under which documents are searched
	 * @param pattern       the glob pattern (without the {@code glob:} prefix) that document paths must match
	 * @param extension     the file extension associated with this corpus
	 */
	public FileSystemCorpus(Lang lang, Path rootDirectory, String pattern, String extension) {
		this.rootDirectory = rootDirectory;
		this.lang = lang;
		this.pattern = pattern;
		this.extension = extension;
	}

	/** @return the language assigned to the documents of this corpus */
	public Lang getLang() {
		return lang;
	}

	/** @param lang the language assigned to the documents of this corpus */
	public void setLang(Lang lang) {
		this.lang = lang;
	}

	/** @return the directory under which documents are searched */
	public Path getRootDirectory() {
		return rootDirectory;
	}

	/** @param rootDirectory the directory under which documents are searched */
	public void setRootDirectory(Path rootDirectory) {
		this.rootDirectory = rootDirectory;
	}

	/** @return the charset used to decode file contents */
	public Charset getEncoding() {
		return encoding;
	}

	/** @param encoding the charset used to decode file contents */
	public void setEncoding(Charset encoding) {
		this.encoding = encoding;
	}

	/** @return the glob pattern (without the {@code glob:} prefix) used to select files */
	public String getPattern() {
		return pattern;
	}

	/** @param pattern the glob pattern (without the {@code glob:} prefix) used to select files */
	public void setPattern(String pattern) {
		this.pattern = pattern;
	}

	/** @return the file extension associated with this corpus */
	public String getExtension() {
		return this.extension;
	}

	/** @param extension the file extension associated with this corpus */
	public void setExtension(String extension) {
		this.extension = extension;
	}

	/**
	 * Validates that the given path denotes an existing directory.
	 *
	 * @param directoryPath the path to validate
	 * @return {@code directoryPath}, unchanged
	 * @throws IllegalArgumentException if the path does not exist or is not a directory
	 */
	private Path check(Path directoryPath) {
		Preconditions.checkArgument(Files.exists(directoryPath),
				"Directory %s does not exist", directoryPath);
		Preconditions.checkArgument(Files.isDirectory(directoryPath),
				"Not a directory: %s", directoryPath);
		return directoryPath;
	}

	/**
	 * Walks {@code directory} recursively and maps every regular file whose
	 * path matches {@code pattern} with {@code pathMapper}.
	 *
	 * <p>The glob is matched against each path exactly as produced by the walk,
	 * i.e. including the {@code directory} prefix, not against the file name alone.
	 *
	 * <p>The returned stream is backed by {@link Files#walk} and holds open
	 * directory handles: callers must close it (try-with-resources) once done.
	 *
	 * @param <T>        the type of the mapped elements
	 * @param directory  the directory to walk
	 * @param pattern    the glob pattern, without the {@code glob:} prefix
	 * @param pathMapper the function applied to each matching file path
	 * @return a lazy stream of mapped elements, to be closed by the caller
	 * @throws IllegalArgumentException if {@code directory} does not exist or is not a directory
	 * @throws TermSuiteException       if the walk cannot be started
	 */
	protected <T> Stream<T> pathWalker(Path directory, String pattern,
			Function<Path, T> pathMapper) {
		String glob = String.format("glob:%s", pattern);

		Path directoryPath = check(directory);

		final PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(
				glob);

		try {
			return Files.walk(directoryPath)
					.filter(path -> pathMatcher.matches(path) && Files.isRegularFile(path))
					.map(pathMapper);
		} catch (IOException e) {
			throw new TermSuiteException(e);
		}
	}

	/**
	 * Reads the raw content of the file referenced by the document URL,
	 * decoded with {@link #getEncoding()}.
	 *
	 * @param doc the document whose file must be read
	 * @return the decoded file content
	 * @throws TermSuiteException if the file cannot be read
	 */
	public String readFileContent(Document doc) {
		try {
			// Standard-library read; replaces the deprecated Guava Files.toString(File, Charset).
			return new String(Files.readAllBytes(Paths.get(doc.getUrl())), encoding);
		} catch (IOException e) {
			throw new TermSuiteException(
					"Could not read file content for document " + doc.getUrl(), 
					e);
		}
	}

	@Override
	public String toString() {
		return String.format("%s[%s]", this.getClass().getSimpleName(), this.rootDirectory);
	}

	/**
	 * Counts the documents of this corpus by walking the root directory.
	 *
	 * @return the number of documents, narrowed to {@code int}
	 */
	public int getNbDocuments() {
		// The stream wraps Files.walk and must be closed to release directory handles.
		try (Stream<Document> docs = documents()) {
			return (int) docs.count();
		}
	}

	/**
	 * Sums the on-disk length of every document file of this corpus.
	 *
	 * @return the total size in bytes
	 */
	public long getTotalSize() {
		// The stream wraps Files.walk and must be closed to release directory handles.
		try (Stream<Document> docs = documents()) {
			return docs
					.mapToLong(doc -> Paths.get(doc.getUrl()).toFile().length())
					.sum();
		}
	}

	/**
	 * Lists the documents of this corpus: one {@link Document} per regular file
	 * under the root directory whose path matches the corpus pattern.
	 *
	 * <p>The returned stream is lazy and backed by {@link Files#walk}: callers
	 * must close it (try-with-resources) once done.
	 *
	 * @return a stream of documents, to be closed by the caller
	 */
	public Stream<Document> documents() {
		return pathWalker(getRootDirectory(), getPattern(), this::toDocument);
	}

	/** Builds the document for a file path, using the path string as URL and the file length as size. */
	private Document toDocument(Path path) {
		Document document = new Document(getLang(), path.toString());
		document.setSize(path.toFile().length());
		return document;
	}

	/**
	 * Reads the file content of the document and passes it through
	 * {@link #cleanRawText(String)}.
	 *
	 * @param doc the document to read
	 * @return the prepared document text
	 * @throws TermSuiteException if the file cannot be read
	 */
	public String readDocumentText(Document doc) {
		return cleanRawText(readFileContent(doc));
	}

	/**
	 * Prepares raw text by delegating to a new {@link StringPreparator}.
	 *
	 * @param rawText the text as read from the file
	 * @return the result of {@code StringPreparator.prepare(rawText)}
	 */
	public String cleanRawText(String rawText) {
		return new StringPreparator().prepare(rawText);
	}

}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy