// com.actelion.research.mapReduceGeneric.examples.WordCount (Maven / Gradle / Ivy)
// Artifact: map-reduce-generic — see the repository listing for all versions and documentation.
/*
* Orbit, a versatile image analysis software for biological image-based quantification.
* Copyright (C) 2009 - 2016 Actelion Pharmaceuticals Ltd., Gewerbestrasse 16, CH-4123 Allschwil, Switzerland.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
package com.actelion.research.mapReduceGeneric.examples;
import com.actelion.research.mapReduceGeneric.IMapReduce;
import com.actelion.research.mapReduceGeneric.executors.IMapReduceExecutor;
import com.actelion.research.mapReduceGeneric.executors.MapReduceExecutorLocalMultiCore;
import com.actelion.research.mapReduceGeneric.utils.Helpers;
import com.actelion.research.mapReduceGeneric.utils.KeyValue;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.*;
/**
* Wordcount demo (it seems that every map reduce framework must have s.th. like this).
* It reads numURLs random wikipedia pages and outputs the 20 most frequent words.
*
 * Please don't run this too often with numURLs set to a high value to not block Wikipedia... !
*/
public class WordCount implements IMapReduce {
private static int numURLs = 100;
private String[] stopWords = new String[]{"this", "not", "or", "do", "does", "you", "with", "from", "this", "was", "were", "for"};
public List> map(String element) {
List> wordList = new ArrayList>();
try {
String content = getRedirectedContentStr(new URL(element));
StringTokenizer tokenizer = new StringTokenizer(content, " ");
while (tokenizer.hasMoreTokens()) {
String word = tokenizer.nextToken().trim();
if (accept(word)) {
//System.out.println("word: "+word);
wordList.add(new KeyValue(word, 1));
}
}
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException("WordCount error: " + e.getMessage()); // important: throw a RuntimeException here so that the mapReduce framework reschedules this job
}
return wordList;
}
public Integer reduce(String key, List valueList) {
int cnt = 0;
for (Integer v : valueList) {
cnt += v;
}
return cnt;
}
public Collection parseParams(String s) {
return Helpers.parseParamsString(s);
}
public String serializeParam(String element) {
return element;
}
private boolean accept(String s) {
if (s == null) return false;
if (s.length() < 5) return false;
if (s.contains("<") || s.contains(">") || s.contains("\"") || s.contains(":") || s.contains("=") || s.contains(",") || s.contains(";") || s.contains(".") || s.contains("/") || s.contains("\\") || s.contains("(") || s.contains(")"))
return false;
for (String stop : stopWords) {
if (s.equalsIgnoreCase(stop)) return false;
}
return true;
}
public String getRedirectedContentStr(URL url) {
StringBuilder sb = new StringBuilder();
BufferedReader in = null;
try {
HttpURLConnection conn = (HttpURLConnection) url.openConnection();
String newUrl = conn.getHeaderField("Location"); // the random page will redirect us...
conn.disconnect();
URL url2 = newUrl != null ? new URL(newUrl) : url;
in = new BufferedReader(
new InputStreamReader(url2.openStream()));
String inputLine;
while ((inputLine = in.readLine()) != null) {
sb.append(inputLine + "\n");
}
in.close();
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
if (in != null) in.close();
} catch (Exception e) {
}
}
return sb.toString();
}
public static void main(String[] args) throws Exception {
List urlList = new ArrayList(numURLs);
for (int i = 0; i < numURLs; i++) {
urlList.add("http://en.wikipedia.org/wiki/Special:Random");
}
//IMapReduceExecutor executor = new MapReduceExecutorLocal();
IMapReduceExecutor executor = new MapReduceExecutorLocalMultiCore();
Map wordCountMap = executor.execute(urlList, new WordCount());
// output most frequent words
List> wordCountList = new ArrayList>(wordCountMap.size());
for (String s : wordCountMap.keySet()) {
wordCountList.add(new KeyValue(s, wordCountMap.get(s)));
}
Collections.sort(wordCountList, new Comparator>() {
public int compare(KeyValue o1, KeyValue o2) {
return o2.getValue().compareTo(o1.getValue());
}
});
System.out.println("Most frequent words:");
for (int i = 0; i < 20; i++) {
KeyValue kv = wordCountList.get(i);
System.out.println(kv.getKey() + ": " + kv.getValue());
}
}
}