All downloads are free. Search and download functionality uses the official Maven repository.

org.lumongo.example.commoncrawl.IndexCommonCrawl Maven / Gradle / Ivy

There is a newer version: 0.52
Show newest version
package org.lumongo.example.commoncrawl;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.Scanner;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

import org.apache.log4j.Logger;
import org.jets3t.service.S3ServiceException;
import org.jets3t.service.ServiceException;
import org.jets3t.service.impl.rest.httpclient.RestS3Service;
import org.jets3t.service.model.S3Object;
import org.jets3t.service.security.AWSCredentials;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.jwat.arc.ArcReader;
import org.jwat.arc.ArcReaderFactory;
import org.jwat.arc.ArcRecord;
import org.jwat.common.Payload;
import org.lumongo.client.command.CreateOrUpdateIndex;
import org.lumongo.client.command.Store;
import org.lumongo.client.config.IndexConfig;
import org.lumongo.client.config.LumongoPoolConfig;
import org.lumongo.client.pool.LumongoWorkPool;
import org.lumongo.cluster.message.Lumongo.LMAnalyzer;
import org.lumongo.doc.ResultDocBuilder;
import org.lumongo.fields.FieldConfigBuilder;
import org.lumongo.util.LogUtil;
import org.lumongo.util.properties.PropertiesReader;

import com.mongodb.BasicDBObject;
import com.mongodb.DBObject;

/**
 * Command-line example that indexes HTML pages from Common Crawl ARC files into a Lumongo index.
 * <p>
 * {@link #main(String[])} lists the S3 objects under a key prefix, creates or updates the target
 * index, and then processes the listed files on a fixed pool of worker threads. Each worker reads
 * one ARC file and stores every <code>text/html</code> record as a document keyed by its URL.
 */
public class IndexCommonCrawl {
	// Field names used in the index configuration and in the stored documents
	private static final String UID = "uid";
	private static final String URL = "url";
	private static final String CONTENTS = "contents";
	private static final String TEXT_CONTENTS = "textContents";
	private static final String TITLE = "title";
	
	// S3 location that is listed in main and read from in handleFile
	private static final String BUCKET = "aws-publicdatasets";
	private static final String KEY_PREFIX = "common-crawl/crawl-002/";
	
	private final static Logger log = Logger.getLogger(IndexCommonCrawl.class);
	
	// Number of documents stored so far; shared by all worker threads
	private static final AtomicLong count = new AtomicLong();
	// Assigned in main before any worker task is submitted and shut down after all of them finish
	private static LumongoWorkPool lumongoWorkPool;
	
	/**
	 * Entry point.
	 *
	 * @param args exactly four values: the AWS properties file (read for <code>awsAccessKey</code> and
	 *            <code>awsSecretKey</code>), the key prefix to list, a comma separated list of Lumongo
	 *            servers, and the index name
	 * @throws Exception if listing the files, creating the index, waiting for the workers or
	 *             shutting down the Lumongo pool fails
	 */
	public static void main(String[] args) throws Exception {
		
		if (args.length != 4) {
			System.err.println("usage: awsPropertiesFile prefix lumongoServers indexName");
			System.err.println("usage: aws.properties 2010/09/25/9 10.0.0.1,10.0.0.2 ccrawl");
			System.exit(1);
		}
		
		LogUtil.loadLogConfig();
		
		String propFileName = args[0];
		String prefix = args[1];
		final String[] serverNames = args[2].split(",");
		final String indexName = args[3];
		
		final LumongoPoolConfig clientConfig = new LumongoPoolConfig();
		for (String serverName : serverNames) {
			clientConfig.addMember(serverName);
		}
		
		File propFile = new File(propFileName);
		
		PropertiesReader pr = new PropertiesReader(propFile);
		
		String awsAccessKey = pr.getString("awsAccessKey");
		String awsSecretKey = pr.getString("awsSecretKey");
		
		final AWSCredentials awsCredentials = new AWSCredentials(awsAccessKey, awsSecretKey);
		
		RestS3Service s3Service = new RestS3Service(awsCredentials);
		s3Service.setRequesterPaysEnabled(true);
		
		System.out.println("Fetching files list for prefix <" + prefix + ">");
		System.out.println("This can take awhile ...");
		
		S3Object[] objects = s3Service.listObjects(BUCKET, KEY_PREFIX + prefix, null);
		System.out.println("Fetched info for <" + objects.length + "> files");
		
		lumongoWorkPool = new LumongoWorkPool(clientConfig);
		
		try {
			IndexConfig indexConfig = new IndexConfig(CONTENTS);
			indexConfig.addFieldConfig(FieldConfigBuilder.create(URL).indexAs(LMAnalyzer.LC_KEYWORD));
			indexConfig.addFieldConfig(FieldConfigBuilder.create(TEXT_CONTENTS).indexAs(LMAnalyzer.STANDARD));
			indexConfig.addFieldConfig(FieldConfigBuilder.create(TITLE).indexAs(LMAnalyzer.STANDARD));
			
			// NOTE(review): 16 is presumably the number of index segments - confirm against CreateOrUpdateIndex
			CreateOrUpdateIndex createOrUpdateIndex = new CreateOrUpdateIndex(indexName, 16, UID, indexConfig);
			
			lumongoWorkPool.createOrUpdateIndex(createOrUpdateIndex);
			
			ExecutorService pool = Executors.newFixedThreadPool(16);
			
			for (S3Object object : objects) {
				final String key = object.getKey();
				
				pool.execute(new Runnable() {
					@Override
					public void run() {
						try {
							handleFile(indexName, awsCredentials, key);
						}
						catch (Exception e) {
							log.error(e.getClass().getSimpleName() + ": ", e);
						}
					}
				});
				
			}
			
			// Stop accepting new tasks, then wait until every submitted file has been processed
			pool.shutdown();
			
			while (!pool.isTerminated()) {
				pool.awaitTermination(1, TimeUnit.MINUTES);
			}
		}
		finally {
			// The workers call lumongoWorkPool.store, so it must stay open until they have all finished
			lumongoWorkPool.shutdown();
		}
		
	}
	
	/**
	 * Reads one ARC file from S3 and indexes its records. A failure on a single record is logged
	 * and does not stop the remaining records of the file from being processed.
	 *
	 * @param indexName name of the index the documents are stored in
	 * @param awsCredentials credentials used to create the S3 client for this file
	 * @param key S3 key of the ARC file
	 * @throws S3ServiceException if the S3 client cannot be created or the object cannot be fetched
	 * @throws IOException if the ARC file cannot be read or closed
	 * @throws ServiceException if the data stream of the object cannot be opened
	 */
	private static void handleFile(String indexName, AWSCredentials awsCredentials, String key) throws S3ServiceException, IOException, ServiceException {
		
		InputStream data = null;
		ArcReader ar = null;
		
		try {
			
			RestS3Service s3Service = new RestS3Service(awsCredentials);
			s3Service.setRequesterPaysEnabled(true);
			
			S3Object object = s3Service.getObject(BUCKET, key);
			data = object.getDataInputStream();
			ar = ArcReaderFactory.getReader(data, 1024 * 16);
			
			log.info("Opened <" + key + ">");
			
			ar.getVersionBlock();
			ArcRecord arcRecord;
			while ((arcRecord = ar.getNextRecord()) != null) {
				try {
					indexRecord(indexName, arcRecord);
				}
				catch (Exception e) {
					// Keep going with the next record; pass the exception so the stack trace is not lost
					log.warn(e.getClass().getSimpleName() + ": ", e);
				}
			}
		}
		finally {
			if (ar != null) {
				log.info("Closed <" + key + ">");
				// NOTE(review): assumes closing the reader also closes the stream it wraps - confirm
				ar.close();
			}
			else if (data != null) {
				// The reader was never created, so the stream has to be released here
				data.close();
			}
		}
		
	}
	
	/**
	 * Stores a single ARC record if it is a non-empty <code>text/html</code> record with a URL.
	 * Records with another content type, without a URL, or without payload bytes are skipped.
	 *
	 * @param indexName name of the index the document is stored in
	 * @param arcRecord record to index
	 * @throws Exception if the payload cannot be read or parsed, or the document cannot be stored
	 */
	private static void indexRecord(String indexName, ArcRecord arcRecord) throws Exception {
		if (!"text/html".equals(arcRecord.getContentType())) {
			return;
		}
		
		// The URL is both the unique id and a stored field, so a record without one cannot be indexed
		if (arcRecord.getUrl() == null) {
			log.debug("Skipping text/html record without a URL");
			return;
		}
		String url = arcRecord.getUrl().toString();
		
		Payload payload = arcRecord.getPayload();
		if (payload == null) {
			log.debug("Skipping record without payload <" + url + ">");
			return;
		}
		
		byte[] bytes = getBytes(payload.getInputStream());
		if (bytes.length == 0) {
			log.debug("Skipping record with empty payload <" + url + ">");
			return;
		}
		
		// A null charset lets jsoup pick the encoding from the page itself instead of the platform default
		Document parsed = Jsoup.parse(new ByteArrayInputStream(bytes), null, url);
		
		DBObject document = new BasicDBObject();
		document.put(CONTENTS, bytes);
		document.put(TEXT_CONTENTS, parsed.text());
		document.put(TITLE, extractTitle(parsed));
		document.put(URL, url);
		
		Store store = new Store(url, indexName);
		store.setResultDocument(new ResultDocBuilder().setDocument(document));
		lumongoWorkPool.store(store);
		
		long stored = count.incrementAndGet();
		if (stored % 5000 == 0) {
			log.info("Indexed <" + stored + ">");
		}
	}
	
	/**
	 * Returns the text of the first <code>title</code> element in the head of the document.
	 * Extraction is best effort: a failure is logged at debug level and treated as "no title".
	 *
	 * @param document parsed HTML document
	 * @return the title text, or <code>null</code> if there is no title element or extraction failed
	 */
	private static String extractTitle(Document document) {
		try {
			// TITLE doubles as the HTML tag name here
			Elements titles = document.head().getElementsByTag(TITLE);
			if (!titles.isEmpty()) {
				return titles.get(0).text();
			}
		}
		catch (RuntimeException e) {
			log.debug("Could not extract title", e);
		}
		return null;
	}
	
	/**
	 * Reads the stream until end of input and closes it, also when reading fails.
	 *
	 * @param is stream to drain
	 * @return all bytes read from the stream; empty if the stream had no data
	 * @throws IOException if reading or closing the stream fails
	 */
	protected static byte[] getBytes(InputStream is) throws IOException {
		try {
			ByteArrayOutputStream buffer = new ByteArrayOutputStream();
			
			byte[] data = new byte[1024 * 16];
			int nRead;
			
			while ((nRead = is.read(data, 0, data.length)) != -1) {
				buffer.write(data, 0, nRead);
			}
			
			return buffer.toByteArray();
		}
		finally {
			if (is != null) {
				is.close();
			}
		}
	}
	
}




© 2015 - 2024 Weber Informatics LLC | Privacy Policy