org.archive.util.binsearch.FileSearchTool Maven / Gradle / Ivy
package org.archive.util.binsearch;
import java.io.File;
import java.net.URI;
import java.util.Comparator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.archive.util.binsearch.impl.HDFSSeekableLineReaderFactory;
import org.archive.util.binsearch.impl.HTTPSeekableLineReaderFactory;
import org.archive.util.binsearch.impl.RandomAccessFileSeekableLineReaderFactory;
import org.archive.util.iterator.SortedCompositeIterator;
public class FileSearchTool implements Tool {
private final static Logger LOGGER =
Logger.getLogger(FileSearchTool.class.getName());
private Configuration conf;
public final static String TOOL_NAME = "bin-search";
public static final String TOOL_DESCRIPTION =
"A tool for performing binary searches through sorted text files, either local, or in HDFS";
private static int USAGE(int code) {
System.err.println("USAGE: " + TOOL_NAME + " [OPTIONS] PREFIX PATH ...");
System.err.println("\tOPTIONS can be one of:");
System.err.println("\t\t--less-than return records starting at the largest record smaller than PREFIX");
System.err.println();
System.err.println("PREFIX is the key to search for");
System.err.println("PATH can be a path to a local file, or a path in an HDFS filesystem");
System.err.println("if multiple PATHs are presented, they are all searched, and the matching records are merged together.");
return code;
}
public int run(String[] args) throws Exception {
if(args.length < 2) {
return USAGE(1);
}
int arg = 0;
boolean lessThan = false;
while(arg < args.length){
if(args[arg].equals("--less-than")) {
lessThan = true;
arg++;
} else if(args[arg].equals("-v")) {
Logger.getLogger(SortedTextFile.class.getName()).setLevel(Level.FINE);
arg++;
} else {
break;
}
}
String key = args[arg++];
if(arg == args.length){
return USAGE(1);
}
Comparator comparator = new Comparator() {
public int compare(String s1, String s2) {
return s1.compareTo(s2);
}
};
SortedCompositeIterator mergeItr =
new SortedCompositeIterator(comparator);
for(int i = arg; i < args.length; i++) {
String spec = args[i];
SeekableLineReaderFactory factory;
if(spec.startsWith("hdfs://")) {
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(new URI(spec),conf);
Path path = new Path(spec);
factory = new HDFSSeekableLineReaderFactory(fs, path);
LOGGER.warning("Added HDFS file: " + spec);
} else if(spec.startsWith("http://")) {
factory = HTTPSeekableLineReaderFactory.getHttpFactory(spec);
LOGGER.warning("Added HTTP file: " + spec);
} else {
File file = new File(spec);
factory = new RandomAccessFileSeekableLineReaderFactory(file);
LOGGER.warning("Added file: " + spec);
}
SortedTextFile stf = new SortedTextFile(factory);
mergeItr.addIterator(stf.getRecordIterator(key, lessThan));
}
while(mergeItr.hasNext()) {
System.out.println(mergeItr.next());
}
System.out.flush();
mergeItr.close();
return 0;
}
public static void main(String[] args) throws Exception {
int res = ToolRunner.run(new Configuration(), new FileSearchTool(), args);
System.exit(res);
}
public void setConf(Configuration conf) {
this.conf = conf;
}
public Configuration getConf() {
return conf;
}
}