All Downloads are FREE. Search and download functionalities are using the official Maven repository.

com.actelion.research.mapReduceGeneric.examples.WordCount Maven / Gradle / Ivy

There is a newer version: 1.0.10
Show newest version
/*
 *     Orbit, a versatile image analysis software for biological image-based quantification.
 *     Copyright (C) 2009 - 2016 Actelion Pharmaceuticals Ltd., Gewerbestrasse 16, CH-4123 Allschwil, Switzerland.
 *
 *     This program is free software: you can redistribute it and/or modify
 *     it under the terms of the GNU General Public License as published by
 *     the Free Software Foundation, either version 3 of the License, or
 *     (at your option) any later version.
 *
 *     This program is distributed in the hope that it will be useful,
 *     but WITHOUT ANY WARRANTY; without even the implied warranty of
 *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *     GNU General Public License for more details.
 *
 *     You should have received a copy of the GNU General Public License
 *     along with this program.  If not, see <http://www.gnu.org/licenses/>.
 *
 */

package com.actelion.research.mapReduceGeneric.examples;

import com.actelion.research.mapReduceGeneric.IMapReduce;
import com.actelion.research.mapReduceGeneric.executors.IMapReduceExecutor;
import com.actelion.research.mapReduceGeneric.executors.MapReduceExecutorLocalMultiCore;
import com.actelion.research.mapReduceGeneric.utils.Helpers;
import com.actelion.research.mapReduceGeneric.utils.KeyValue;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;

/**
 * Wordcount demo (it seems that every map reduce framework must have s.th. like this).
 * It reads numURLs random wikipedia pages and outputs the 20 most frequent words.
 * 

* Please don't run this too ofter with numURLs set to a high value to not block Wikipedia... ! */ public class WordCount implements IMapReduce { private static int numURLs = 100; private String[] stopWords = new String[]{"this", "not", "or", "do", "does", "you", "with", "from", "this", "was", "were", "for"}; public List> map(String element) { List> wordList = new ArrayList>(); try { String content = getRedirectedContentStr(new URL(element)); StringTokenizer tokenizer = new StringTokenizer(content, " "); while (tokenizer.hasMoreTokens()) { String word = tokenizer.nextToken().trim(); if (accept(word)) { //System.out.println("word: "+word); wordList.add(new KeyValue(word, 1)); } } } catch (Exception e) { e.printStackTrace(); throw new RuntimeException("WordCount error: " + e.getMessage()); // important: throw a RuntimeException here so that the mapReduce framework reschedules this job } return wordList; } public Integer reduce(String key, List valueList) { int cnt = 0; for (Integer v : valueList) { cnt += v; } return cnt; } public Collection parseParams(String s) { return Helpers.parseParamsString(s); } public String serializeParam(String element) { return element; } private boolean accept(String s) { if (s == null) return false; if (s.length() < 5) return false; if (s.contains("<") || s.contains(">") || s.contains("\"") || s.contains(":") || s.contains("=") || s.contains(",") || s.contains(";") || s.contains(".") || s.contains("/") || s.contains("\\") || s.contains("(") || s.contains(")")) return false; for (String stop : stopWords) { if (s.equalsIgnoreCase(stop)) return false; } return true; } public String getRedirectedContentStr(URL url) { StringBuilder sb = new StringBuilder(); BufferedReader in = null; try { HttpURLConnection conn = (HttpURLConnection) url.openConnection(); String newUrl = conn.getHeaderField("Location"); // the random page will redirect us... conn.disconnect(); URL url2 = newUrl != null ? 
new URL(newUrl) : url; in = new BufferedReader( new InputStreamReader(url2.openStream())); String inputLine; while ((inputLine = in.readLine()) != null) { sb.append(inputLine + "\n"); } in.close(); } catch (IOException e) { e.printStackTrace(); } finally { try { if (in != null) in.close(); } catch (Exception e) { } } return sb.toString(); } public static void main(String[] args) throws Exception { List urlList = new ArrayList(numURLs); for (int i = 0; i < numURLs; i++) { urlList.add("http://en.wikipedia.org/wiki/Special:Random"); } //IMapReduceExecutor executor = new MapReduceExecutorLocal(); IMapReduceExecutor executor = new MapReduceExecutorLocalMultiCore(); Map wordCountMap = executor.execute(urlList, new WordCount()); // output most frequent words List> wordCountList = new ArrayList>(wordCountMap.size()); for (String s : wordCountMap.keySet()) { wordCountList.add(new KeyValue(s, wordCountMap.get(s))); } Collections.sort(wordCountList, new Comparator>() { public int compare(KeyValue o1, KeyValue o2) { return o2.getValue().compareTo(o1.getValue()); } }); System.out.println("Most frequent words:"); for (int i = 0; i < 20; i++) { KeyValue kv = wordCountList.get(i); System.out.println(kv.getKey() + ": " + kv.getValue()); } } }





© 2015 - 2024 Weber Informatics LLC | Privacy Policy