// io.anserini.collection.DocumentCollection Maven / Gradle / Ivy
/*
* Anserini: A Lucene toolkit for reproducible information retrieval research
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package io.anserini.collection;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;
/**
 * <p>A static collection of documents, comprised of one or more {@link FileSegment}s.
 * Each {@link FileSegment} is a container for {@link SourceDocument}s.
 * A collection is assumed to be a directory. In the case where the collection is
 * a single file (e.g., a Wikipedia dump), place the file into an arbitrary directory.</p>
 *
 * <p>The collection is responsible for discovering files with qualified names in the input
 * directory. The file segment implementation is responsible for reading each file to generate
 * {@link SourceDocument}s for indexing.</p>
 *
 * <p>The steps for adding a new collection class are:</p>
 *
 * <ol>
 *
 * <li>Create a subclass of {@link DocumentCollection}.</li>
 *
 * <li>Implement class {@link FileSegment}, by convention as an inner class of the
 * {@code DocumentCollection}. See {@link TrecCollection.Segment} as an example.</li>
 *
 * <li>Create a subclass of {@link SourceDocument} implementing the corresponding document type.
 * See {@link TrecCollection.Document} as an example.</li>
 *
 * <li>Optionally create a new {@link io.anserini.index.generator.LuceneDocumentGenerator}.
 * The {@link io.anserini.index.generator.LuceneDocumentGenerator#createDocument}
 * method takes a {@code SourceDocument} as the input and returns a Lucene
 * {@link org.apache.lucene.document.Document} for indexing.</li>
 *
 * <li>Remember to add unit tests at {@code src/test/java/io/anserini/collection}!</li>
 *
 * </ol>
 */
public abstract class DocumentCollection implements Iterable> {
private static final Logger LOG = LogManager.getLogger(DocumentCollection.class);
protected Path path;
protected Set skippedFilePrefix = new HashSet<>();
protected Set allowedFilePrefix = new HashSet<>();
protected Set skippedFileSuffix = new HashSet<>();
protected Set allowedFileSuffix = new HashSet<>();
protected Set skippedDir = new HashSet<>();
/**
* Returns the path of the collection.
*
* @return path of the collection
*/
public final Path getCollectionPath() {
return path;
}
/**
* Creates a {@code FileSegment} from a path.
*
* @param p path
* @return {@code FileSegment} with the specified path
* @throws IOException if file access error encountered
*/
public abstract FileSegment createFileSegment(Path p) throws IOException;
/**
* Creates a {@code FileSegment} from a path.
*
* @param bufferedReader raw BufferedReader
* @return {@code FileSegment} with the specified path
* @throws IOException if file access error encountered
*/
public abstract FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException;
/**
* An iterator over {@code FileSegment} for the {@code DocumentCollection} iterable.
* A collection is comprised of one or more file segments.
*/
@Override
public final Iterator> iterator() {
List paths = discover(this.path);
Iterator pathsIterator = paths.iterator();
return new Iterator<>() {
Path segmentPath;
FileSegment segment;
@Override
public boolean hasNext() {
if (segment != null) {
return true;
}
if (!pathsIterator.hasNext()) {
return false;
} else {
try {
segmentPath = pathsIterator.next();
segment = createFileSegment(segmentPath);
} catch (IOException e) {
return false;
}
}
return true;
}
@Override
public FileSegment next() throws NoSuchElementException {
if (!hasNext()) {
throw new NoSuchElementException("No more file segments to read.");
} else {
FileSegment seg = segment;
segment = null;
return seg;
}
}
@Override
public void remove() {
throw new UnsupportedOperationException();
}
};
}
/**
* Returns the paths in the collection.
*
* @return paths in the collection
*/
public List getSegmentPaths() {
return discover(this.path);
}
/**
* Returns the paths in the collection, taking into account sharding.
*
* @param currShard the current shard
* @param shardCount the total number of shards
* @return file segments in current shard
*/
public List getSegmentPaths(int shardCount, int currShard) {
List segments = discover(this.path);
return segments.stream().filter(x -> x.toString().hashCode() % shardCount == currShard).collect(Collectors.toList());
}
// Private method for walking a path.
private List discover(Path p) {
final List paths = new ArrayList<>();
FileVisitor fv = new SimpleFileVisitor<>() {
@Override
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
Path name = file.getFileName();
boolean shouldAdd = true;
if (name != null) {
String fileName = name.toString();
for (String s : skippedFileSuffix) {
if (fileName.endsWith(s)) {
shouldAdd = false;
break;
}
}
if (shouldAdd && !allowedFileSuffix.isEmpty()) {
shouldAdd = false;
for (String s : allowedFileSuffix) {
if (fileName.endsWith(s)) {
shouldAdd = true;
break;
}
}
}
if (shouldAdd) {
for (String s : skippedFilePrefix) {
if (fileName.startsWith(s)) {
shouldAdd = false;
break;
}
}
}
if (shouldAdd && !allowedFilePrefix.isEmpty()) {
shouldAdd = false;
for (String s : allowedFilePrefix) {
if (fileName.startsWith(s)) {
shouldAdd = true;
break;
}
}
}
}
if (shouldAdd) {
paths.add(file);
}
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) {
if (skippedDir.contains(dir.getFileName().toString())) {
LOG.info("Skipping: " + dir);
return FileVisitResult.SKIP_SUBTREE;
}
return FileVisitResult.CONTINUE;
}
@Override
public FileVisitResult visitFileFailed(Path file, IOException ioe) {
LOG.error("Visiting failed for " + file.toString(), ioe);
return FileVisitResult.SKIP_SUBTREE;
}
};
try {
Files.walkFileTree(p, EnumSet.of(FileVisitOption.FOLLOW_LINKS), Integer.MAX_VALUE, fv);
} catch (IOException e) {
LOG.error("IOException during file visiting", e);
}
return paths;
}
}
// © 2015 - 2025 Weber Informatics LLC | Privacy Policy