All Downloads are FREE. Search and download functionalities are using the official Maven repository.

io.anserini.collection.DocumentCollection Maven / Gradle / Ivy

/*
 * Anserini: A Lucene toolkit for reproducible information retrieval research
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package io.anserini.collection;

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.FileVisitOption;
import java.nio.file.FileVisitResult;
import java.nio.file.FileVisitor;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.SimpleFileVisitor;
import java.nio.file.attribute.BasicFileAttributes;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Set;
import java.util.stream.Collectors;

/**
 * 

A static collection of documents, comprised of one or more {@link FileSegment}s. * Each {@link FileSegment} is a container for {@link SourceDocument}s. * A collection is assumed to be a directory. In the case where the collection is * a single file (e.g., a Wikipedia dump), place the file into an arbitrary directory.

* *

The collection is responsible for discovering files with qualified names in the input * directory. The file segment implementation is responsible for reading each file to generate * {@link SourceDocument}s for indexing.

* *

The steps of adding a new collection class are:

* *
    * *
  1. Create a subclass for {@link DocumentCollection}.
  2. * *
  3. Implement class {@link FileSegment}, by convention as an inner class of the * {@code DocumentCollection}. See {@link TrecCollection.Segment} as an example.
  4. * *
  5. Create a subclass for {@link SourceDocument} implementing the corresponding document type. * See {@link TrecCollection.Document} as an example.
  6. * *
  7. Optionally create a new {@link io.anserini.index.generator.LuceneDocumentGenerator}. * The {@link io.anserini.index.generator.LuceneDocumentGenerator#createDocument} * method takes {@code SourceDocument} as the input and return a Lucene * {@link org.apache.lucene.document.Document} for indexing.
  8. * *
  9. Remember to add unit tests at {@code src/test/java/io/anserini/collection}!
  10. * *
*/ public abstract class DocumentCollection implements Iterable> { private static final Logger LOG = LogManager.getLogger(DocumentCollection.class); protected Path path; protected Set skippedFilePrefix = new HashSet<>(); protected Set allowedFilePrefix = new HashSet<>(); protected Set skippedFileSuffix = new HashSet<>(); protected Set allowedFileSuffix = new HashSet<>(); protected Set skippedDir = new HashSet<>(); /** * Returns the path of the collection. * * @return path of the collection */ public final Path getCollectionPath() { return path; } /** * Creates a {@code FileSegment} from a path. * * @param p path * @return {@code FileSegment} with the specified path * @throws IOException if file access error encountered */ public abstract FileSegment createFileSegment(Path p) throws IOException; /** * Creates a {@code FileSegment} from a path. * * @param bufferedReader raw BufferedReader * @return {@code FileSegment} with the specified path * @throws IOException if file access error encountered */ public abstract FileSegment createFileSegment(BufferedReader bufferedReader) throws IOException; /** * An iterator over {@code FileSegment} for the {@code DocumentCollection} iterable. * A collection is comprised of one or more file segments. */ @Override public final Iterator> iterator() { List paths = discover(this.path); Iterator pathsIterator = paths.iterator(); return new Iterator<>() { Path segmentPath; FileSegment segment; @Override public boolean hasNext() { if (segment != null) { return true; } if (!pathsIterator.hasNext()) { return false; } else { try { segmentPath = pathsIterator.next(); segment = createFileSegment(segmentPath); } catch (IOException e) { return false; } } return true; } @Override public FileSegment next() throws NoSuchElementException { if (!hasNext()) { throw new NoSuchElementException("No more file segments to read."); } else { FileSegment seg = segment; segment = null; return seg; } } @Override public void remove() { throw new UnsupportedOperationException(); } }; } /** * Returns the paths in the collection. * * @return paths in the collection */ public List getSegmentPaths() { return discover(this.path); } /** * Returns the paths in the collection, taking into account sharding. * * @param currShard the current shard * @param shardCount the total number of shards * @return file segments in current shard */ public List getSegmentPaths(int shardCount, int currShard) { List segments = discover(this.path); return segments.stream().filter(x -> x.toString().hashCode() % shardCount == currShard).collect(Collectors.toList()); } // Private method for walking a path. private List discover(Path p) { final List paths = new ArrayList<>(); FileVisitor fv = new SimpleFileVisitor<>() { @Override public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { Path name = file.getFileName(); boolean shouldAdd = true; if (name != null) { String fileName = name.toString(); for (String s : skippedFileSuffix) { if (fileName.endsWith(s)) { shouldAdd = false; break; } } if (shouldAdd && !allowedFileSuffix.isEmpty()) { shouldAdd = false; for (String s : allowedFileSuffix) { if (fileName.endsWith(s)) { shouldAdd = true; break; } } } if (shouldAdd) { for (String s : skippedFilePrefix) { if (fileName.startsWith(s)) { shouldAdd = false; break; } } } if (shouldAdd && !allowedFilePrefix.isEmpty()) { shouldAdd = false; for (String s : allowedFilePrefix) { if (fileName.startsWith(s)) { shouldAdd = true; break; } } } } if (shouldAdd) { paths.add(file); } return FileVisitResult.CONTINUE; } @Override public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) { if (skippedDir.contains(dir.getFileName().toString())) { LOG.info("Skipping: " + dir); return FileVisitResult.SKIP_SUBTREE; } return FileVisitResult.CONTINUE; } @Override public FileVisitResult visitFileFailed(Path file, IOException ioe) { LOG.error("Visiting failed for " + file.toString(), ioe); return FileVisitResult.SKIP_SUBTREE; } }; try { Files.walkFileTree(p, EnumSet.of(FileVisitOption.FOLLOW_LINKS), Integer.MAX_VALUE, fv); } catch (IOException e) { LOG.error("IOException during file visiting", e); } return paths; } }




© 2015 - 2025 Weber Informatics LLC | Privacy Policy