/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Rumen is a data extraction and analysis tool built for Apache Hadoop.
 * Rumen mines job history logs to extract meaningful data and stores it
 * in an easily parsed format.
 *
 * The default output format of Rumen is JSON; Rumen uses the Jackson
 * library to create the JSON objects.
 *
 * The following classes can be used to programmatically invoke Rumen:
 * 
 *   - {@link org.apache.hadoop.tools.rumen.JobConfigurationParser}
 *      A parser that parses a job configuration file and filters out the
 *      interesting properties.
 *
 *      Sample code:
 *
 *        // An example to parse and filter out the job name
 *
 *        String conf_filename = .. // assume the job configuration filename here
 *
 *        // construct a list of interesting properties
 *        List<String> interestedProperties = new ArrayList<String>();
 *        interestedProperties.add("mapreduce.job.name");
 *
 *        JobConfigurationParser jcp =
 *          new JobConfigurationParser(interestedProperties);
 *
 *        InputStream in = new FileInputStream(conf_filename);
 *        Properties parsedProperties = jcp.parse(in);
 *
 *      Some of the commonly used interesting properties are enumerated in
 *      {@link org.apache.hadoop.tools.rumen.JobConfPropertyNames}.
 *
 *      Note:
 *        A single instance of {@link org.apache.hadoop.tools.rumen.JobConfigurationParser}
 *        can be used to parse multiple job configuration files, as the
 *        sketch below shows.
 *
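 *      A minimal sketch that reuses one parser instance across several
 *      configuration files; the filenames here are hypothetical
 *      placeholders:
 *
 *        List<String> interestedProperties = new ArrayList<String>();
 *        interestedProperties.add("mapreduce.job.name");
 *
 *        JobConfigurationParser jcp =
 *          new JobConfigurationParser(interestedProperties);
 *
 *        // the same parser instance handles each configuration file in turn
 *        for (String confFile : new String[] {"conf1.xml", "conf2.xml"}) {
 *          InputStream in = new FileInputStream(confFile);
 *          try {
 *            Properties parsed = jcp.parse(in);
 *            // ... use the parsed properties
 *          } finally {
 *            in.close();
 *          }
 *        }
 *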
 *   - {@link org.apache.hadoop.tools.rumen.JobHistoryParser}
 *      A parser that parses job history files. It is an interface; the
 *      actual implementations are enumerated in
 *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory}. Note that
 *      {@link org.apache.hadoop.tools.rumen.RewindableInputStream}
 *      is a wrapper around {@link java.io.InputStream} that makes the input
 *      stream rewindable.
 *
 *      Sample code:
 *
 *        // An example to parse a current job history file, i.e. a job
 *        // history file whose version is known
 *
 *        String filename = .. // assume the job history filename here
 *
 *        InputStream in = new FileInputStream(filename);
 *
 *        JobHistoryParser parser = new CurrentJHParser(in);
 *
 *        // read and process all the events
 *        HistoryEvent event = parser.nextEvent();
 *        while (event != null) {
 *          // ... process the event
 *          event = parser.nextEvent();
 *        }
 *
 *        // close the parser and the underlying stream
 *        parser.close();
 *
 *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory} provides the
 *      {@link org.apache.hadoop.tools.rumen.JobHistoryParserFactory#getParser(org.apache.hadoop.tools.rumen.RewindableInputStream)}
 *      API for obtaining a parser for a job history file. Use this API when
 *      the job history version is unknown.
 *
 *      Sample code:
 *
 *        // An example to parse a job history file whose version is not
 *        // known, i.e. using JobHistoryParserFactory.getParser()
 *
 *        String filename = .. // assume the job history filename here
 *
 *        InputStream in = new FileInputStream(filename);
 *        RewindableInputStream ris = new RewindableInputStream(in);
 *
 *        // JobHistoryParserFactory will check the file and return a parser
 *        // that can parse it
 *        JobHistoryParser parser = JobHistoryParserFactory.getParser(ris);
 *
 *        // now use the parser to parse the events
 *        HistoryEvent event = parser.nextEvent();
 *        while (event != null) {
 *          // ... process the event
 *          event = parser.nextEvent();
 *        }
 *
 *        parser.close();
 *
 *      Note:
 *        Create one parser instance per job history log and close it after
 *        use, as the sketch below shows.
 *
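 *      A minimal sketch, reusing the hypothetical 'filename' from above,
 *      that guarantees the parser is closed even if event processing
 *      throws:
 *
 *        RewindableInputStream ris =
 *          new RewindableInputStream(new FileInputStream(filename));
 *        JobHistoryParser parser = JobHistoryParserFactory.getParser(ris);
 *        try {
 *          for (HistoryEvent e = parser.nextEvent(); e != null;
 *               e = parser.nextEvent()) {
 *            // ... process the event
 *          }
 *        } finally {
 *          // always release the parser and the underlying stream
 *          parser.close();
 *        }
 *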
 *   - {@link org.apache.hadoop.tools.rumen.TopologyBuilder}
 *      Builds the cluster topology from job history events. Every job
 *      history file consists of events, and each event can be represented
 *      using {@link org.apache.hadoop.mapreduce.jobhistory.HistoryEvent}.
 *      These events can be passed to {@link org.apache.hadoop.tools.rumen.TopologyBuilder} using
 *      {@link org.apache.hadoop.tools.rumen.TopologyBuilder#process(org.apache.hadoop.mapreduce.jobhistory.HistoryEvent)}.
 *      A cluster topology can be represented using {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}.
 *      Once all the job history events are processed, the cluster
 *      topology can be obtained using {@link org.apache.hadoop.tools.rumen.TopologyBuilder#build()}.
 *
 *      Sample code:
 *
 *        // Building the topology for a job history file represented by
 *        // 'filename' and the corresponding configuration file represented
 *        // by 'conf_filename'
 *        String filename = .. // assume the job history filename here
 *        String conf_filename = .. // assume the job configuration filename here
 *
 *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
 *        InputStream jobHistoryInputStream = new FileInputStream(filename);
 *
 *        TopologyBuilder tb = new TopologyBuilder();
 *
 *        // construct a list of interesting properties
 *        List<String> interestingProperties = new ArrayList<String>();
 *        // add the interesting properties here
 *        interestingProperties.add("mapreduce.job.name");
 *
 *        JobConfigurationParser jcp =
 *          new JobConfigurationParser(interestingProperties);
 *
 *        // parse the configuration file
 *        tb.process(jcp.parse(jobConfInputStream));
 *
 *        // read the job history file and pass its events to the
 *        // TopologyBuilder
 *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
 *        HistoryEvent e;
 *
 *        // read and process all the job history events
 *        while ((e = parser.nextEvent()) != null) {
 *          tb.process(e);
 *        }
 *
 *        LoggedNetworkTopology topology = tb.build();
 *
 *   - {@link org.apache.hadoop.tools.rumen.JobBuilder}
 *      Summarizes a job history file.
 *      {@link org.apache.hadoop.tools.rumen.JobHistoryUtils} provides the
 *      {@link org.apache.hadoop.tools.rumen.JobHistoryUtils#extractJobID(String)}
 *      API for extracting the job id from a job history or job configuration
 *      filename, which can be used for instantiating {@link org.apache.hadoop.tools.rumen.JobBuilder}.
 *      {@link org.apache.hadoop.tools.rumen.JobBuilder} generates a
 *      {@link org.apache.hadoop.tools.rumen.LoggedJob} object via
 *      {@link org.apache.hadoop.tools.rumen.JobBuilder#build()}.
 *      See {@link org.apache.hadoop.tools.rumen.LoggedJob} for more details.
 *
 *      Sample code:
 *
 *        // An example to summarize a current job history file 'filename'
 *        // and the corresponding configuration file 'conf_filename'
 *
 *        String filename = .. // assume the job history filename here
 *        String conf_filename = .. // assume the job configuration filename here
 *
 *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
 *        InputStream jobHistoryInputStream = new FileInputStream(filename);
 *
 *        String jobID = JobHistoryUtils.extractJobID(filename);
 *        JobBuilder jb = new JobBuilder(jobID);
 *
 *        // construct a list of interesting properties
 *        List<String> interestingProperties = new ArrayList<String>();
 *        // add the interesting properties here
 *        interestingProperties.add("mapreduce.job.name");
 *
 *        JobConfigurationParser jcp =
 *          new JobConfigurationParser(interestingProperties);
 *
 *        // parse the configuration file
 *        jb.process(jcp.parse(jobConfInputStream));
 *
 *        // parse the job history file
 *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
 *        try {
 *          HistoryEvent e;
 *          // read and process all the job history events
 *          while ((e = parser.nextEvent()) != null) {
 *            jb.process(e);
 *          }
 *        } finally {
 *          parser.close();
 *        }
 *
 *        LoggedJob job = jb.build();
 *
 *      Note:
 *        The job configuration file and the job history file can be parsed
 *        in either order, but use a single {@link org.apache.hadoop.tools.rumen.JobBuilder}
 *        instance for both; a sketch follows.
 *
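 *      A minimal sketch, reusing the hypothetical streams from above, that
 *      parses the history file before the configuration file; the resulting
 *      {@link org.apache.hadoop.tools.rumen.LoggedJob} is the same:
 *
 *        JobBuilder jb = new JobBuilder(jobID);
 *
 *        // process the job history events first ...
 *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
 *        try {
 *          HistoryEvent e;
 *          while ((e = parser.nextEvent()) != null) {
 *            jb.process(e);
 *          }
 *        } finally {
 *          parser.close();
 *        }
 *
 *        // ... and the parsed configuration properties afterwards
 *        jb.process(jcp.parse(jobConfInputStream));
 *
 *        LoggedJob job = jb.build();
 *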
 *   - {@link org.apache.hadoop.tools.rumen.DefaultOutputter}
 *      Implements {@link org.apache.hadoop.tools.rumen.Outputter} and writes
 *      JSON objects in text format to the output file.
 *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter} is
 *      initialized with the output filename.
 *
 *      Sample code:
 *
 *        // An example to summarize a current job history file represented by
 *        // 'filename' and the corresponding configuration file represented
 *        // by 'conf_filename', writing the job summary to 'out.json' and
 *        // the cluster topology to 'topology.json'.
 *
 *        String filename = .. // assume the job history filename here
 *        String conf_filename = .. // assume the job configuration filename here
 *
 *        Configuration conf = new Configuration();
 *        DefaultOutputter outputter = new DefaultOutputter();
 *        outputter.init(new Path("out.json"), conf);
 *
 *        InputStream jobConfInputStream = new FileInputStream(conf_filename);
 *        InputStream jobHistoryInputStream = new FileInputStream(filename);
 *
 *        // extract the job-id from the job history filename
 *        String jobID = JobHistoryUtils.extractJobID(filename);
 *        JobBuilder jb = new JobBuilder(jobID);
 *        TopologyBuilder tb = new TopologyBuilder();
 *
 *        // construct a list of interesting properties
 *        List<String> interestingProperties = new ArrayList<String>();
 *        // add the interesting properties here
 *        interestingProperties.add("mapreduce.job.name");
 *
 *        JobConfigurationParser jcp =
 *          new JobConfigurationParser(interestingProperties);
 *
 *        // parse the configuration file
 *        tb.process(jcp.parse(jobConfInputStream));
 *
 *        // read the job history file and pass each event to both the
 *        // JobBuilder and the TopologyBuilder
 *        JobHistoryParser parser = new CurrentJHParser(jobHistoryInputStream);
 *        HistoryEvent e;
 *        while ((e = parser.nextEvent()) != null) {
 *          jb.process(e);
 *          tb.process(e);
 *        }
 *
 *        LoggedJob j = jb.build();
 *
 *        // serialize the job summary in JSON (text) format
 *        outputter.output(j);
 *
 *        // close
 *        outputter.close();
 *
 *        outputter.init(new Path("topology.json"), conf);
 *
 *        // get the cluster topology from the TopologyBuilder
 *        LoggedNetworkTopology topology = tb.build();
 *
 *        // serialize the cluster topology in JSON (text) format
 *        outputter.output(topology);
 *
 *        // close
 *        outputter.close();
 *
 *   - {@link org.apache.hadoop.tools.rumen.JobTraceReader}
 *      A reader for reading {@link org.apache.hadoop.tools.rumen.LoggedJob} objects serialized using
 *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.LoggedJob}
 *      provides various APIs for extracting job details. The following are
 *      the most commonly used ones:
 *
 *        - {@link org.apache.hadoop.tools.rumen.LoggedJob#getMapTasks()} : Get the map tasks
 *        - {@link org.apache.hadoop.tools.rumen.LoggedJob#getReduceTasks()} : Get the reduce tasks
 *        - {@link org.apache.hadoop.tools.rumen.LoggedJob#getOtherTasks()} : Get the setup/cleanup tasks
 *        - {@link org.apache.hadoop.tools.rumen.LoggedJob#getOutcome()} : Get the job's outcome
 *        - {@link org.apache.hadoop.tools.rumen.LoggedJob#getSubmitTime()} : Get the job's submit time
 *        - {@link org.apache.hadoop.tools.rumen.LoggedJob#getFinishTime()} : Get the job's finish time
 *
 *      Sample code:
 *
 *        // An example to read job summaries from a trace file 'out.json'
 *        JobTraceReader reader =
 *          new JobTraceReader(new Path("out.json"), new Configuration());
 *        LoggedJob job = reader.getNext();
 *        while (job != null) {
 *          // ... process job level information
 *          for (LoggedTask task : job.getMapTasks()) {
 *            // process all the map tasks in the job
 *            for (LoggedTaskAttempt attempt : task.getAttempts()) {
 *              // process all the map task attempts in the job
 *            }
 *          }
 *
 *          // get the next job
 *          job = reader.getNext();
 *        }
 *        reader.close();
 *
 *   - {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader}
 *      A reader for reading a {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology} serialized using
 *      {@link org.apache.hadoop.tools.rumen.DefaultOutputter}. {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader} is
 *      initialized with the serialized topology filename, and
 *      {@link org.apache.hadoop.tools.rumen.ClusterTopologyReader#get()}
 *      returns the
 *      {@link org.apache.hadoop.tools.rumen.LoggedNetworkTopology}.
 *
 *      Sample code:
 *
 *        // An example to read the cluster topology from a topology output
 *        // file 'topology.json'
 *        ClusterTopologyReader reader =
 *          new ClusterTopologyReader(new Path("topology.json"),
 *                                    new Configuration());
 *        LoggedNetworkTopology topology = reader.get();
 *        for (LoggedNetworkTopology t : topology.getChildren()) {
 *          // process the cluster topology
 *        }
 *
*/
package org.apache.hadoop.tools.rumen;