org.apache.hadoop.hive.ql.exec.mr.JobDebugger Maven / Gradle / Ivy
Go to download
Show more of this group Show more artifacts with this name
Show all versions of hive-apache Show documentation
Show all versions of hive-apache Show documentation
Shaded version of Apache Hive for Trino
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hive.ql.exec.mr;
import java.io.IOException;
import java.lang.Exception;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.ErrorMsg;
import org.apache.hadoop.hive.ql.exec.errors.ErrorAndSolution;
import org.apache.hadoop.hive.ql.exec.errors.TaskLogProcessor;
import org.apache.hadoop.hive.ql.session.SessionState.LogHelper;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RunningJob;
import org.apache.hadoop.mapred.TaskCompletionEvent;
/**
* JobDebugger takes a RunningJob that has failed and grabs the top 4 failing
* tasks and outputs this information to the Hive CLI.
*/
public class JobDebugger implements Runnable {
private final JobConf conf;
private final RunningJob rj;
private final LogHelper console;
private final Map>> stackTraces;
// Mapping from task ID to the number of failures
private final Map failures = new HashMap();
private final Set successes = new HashSet(); // Successful task ID's
private final Map taskIdToInfo = new HashMap();
private String diagnosticMesg;
private int maxFailures = 0;
// Used for showJobFailDebugInfo
private static class TaskInfo {
String jobId;
Set logUrls;
int errorCode; // Obtained from the HiveException thrown
String[] diagnosticMesgs;
public TaskInfo(String jobId) {
this.jobId = jobId;
logUrls = new HashSet();
errorCode = 0;
diagnosticMesgs = null;
}
public void addLogUrl(String logUrl) {
logUrls.add(logUrl);
}
public void setErrorCode(int errorCode) {
this.errorCode = errorCode;
}
public void setDiagnosticMesgs(String[] diagnosticMesgs) {
this.diagnosticMesgs = diagnosticMesgs;
}
public Set getLogUrls() {
return logUrls;
}
public String getJobId() {
return jobId;
}
public int getErrorCode() {
return errorCode;
}
public String[] getDiagnosticMesgs() {
return diagnosticMesgs;
}
}
public JobDebugger(JobConf conf, RunningJob rj, LogHelper console) {
this.conf = conf;
this.rj = rj;
this.console = console;
this.stackTraces = null;
}
public JobDebugger(JobConf conf, RunningJob rj, LogHelper console,
Map>> stackTraces) {
this.conf = conf;
this.rj = rj;
this.console = console;
this.stackTraces = stackTraces;
}
public void run() {
try {
diagnosticMesg = showJobFailDebugInfo();
} catch (IOException e) {
console.printError(e.getMessage());
}
}
public static int extractErrorCode(String[] diagnostics) {
int result = 0;
Pattern errorCodeRegex = ErrorMsg.getErrorCodePattern();
for (String mesg : diagnostics) {
Matcher matcher = errorCodeRegex.matcher(mesg);
if (matcher.find()) {
result = Integer.parseInt(matcher.group(1));
// We don't exit the loop early because we want to extract the error code
// corresponding to the bottommost error coded exception.
}
}
return result;
}
class TaskInfoGrabber implements Runnable {
public void run() {
try {
getTaskInfos();
} catch (Exception e) {
console.printError(e.getMessage());
}
}
private void getTaskInfos() throws IOException, MalformedURLException {
int startIndex = 0;
while (true) {
TaskCompletionEvent[] taskCompletions = rj.getTaskCompletionEvents(startIndex);
if (taskCompletions == null || taskCompletions.length == 0) {
break;
}
boolean more = true;
boolean firstError = true;
for (TaskCompletionEvent t : taskCompletions) {
// For each task completion event, get the associated task id, job id
// and the logs
String taskId = t.getTaskAttemptId().getTaskID().toString();
String jobId = t.getTaskAttemptId().getJobID().toString();
if (firstError) {
console.printError("Examining task ID: " + taskId + " (and more) from job " + jobId);
firstError = false;
}
TaskInfo ti = taskIdToInfo.get(taskId);
if (ti == null) {
ti = new TaskInfo(jobId);
taskIdToInfo.put(taskId, ti);
}
// These tasks should have come from the same job.
assert (ti.getJobId() != null && ti.getJobId().equals(jobId));
String taskAttemptLogUrl = ShimLoader.getHadoopShims().getTaskAttemptLogUrl(
conf, t.getTaskTrackerHttp(), t.getTaskId());
if (taskAttemptLogUrl != null) {
ti.getLogUrls().add(taskAttemptLogUrl);
}
// If a task failed, fetch its error code (if available).
// Also keep track of the total number of failures for that
// task (typically, a task gets re-run up to 4 times if it fails.
if (t.getTaskStatus() != TaskCompletionEvent.Status.SUCCEEDED) {
String[] diags = rj.getTaskDiagnostics(t.getTaskAttemptId());
ti.setDiagnosticMesgs(diags);
if (ti.getErrorCode() == 0) {
ti.setErrorCode(extractErrorCode(diags));
}
Integer failAttempts = failures.get(taskId);
if (failAttempts == null) {
failAttempts = Integer.valueOf(0);
}
failAttempts = Integer.valueOf(failAttempts.intValue() + 1);
failures.put(taskId, failAttempts);
} else {
successes.add(taskId);
}
}
if (!more) {
break;
}
startIndex += taskCompletions.length;
}
}
}
private void computeMaxFailures() {
maxFailures = 0;
for (Integer failCount : failures.values()) {
if (maxFailures < failCount.intValue()) {
maxFailures = failCount.intValue();
}
}
}
private String showJobFailDebugInfo() throws IOException {
console.printError("Error during job, obtaining debugging information...");
if (!conf.get("mapred.job.tracker", "local").equals("local")) {
// Show Tracking URL for remotely running jobs.
console.printError("Job Tracking URL: " + rj.getTrackingURL());
}
// Loop to get all task completion events because getTaskCompletionEvents
// only returns a subset per call
TaskInfoGrabber tlg = new TaskInfoGrabber();
Thread t = new Thread(tlg);
try {
t.start();
t.join(HiveConf.getIntVar(conf, HiveConf.ConfVars.TASKLOG_DEBUG_TIMEOUT));
} catch (InterruptedException e) {
console.printError("Timed out trying to finish grabbing task log URLs, "
+ "some task info may be missing");
}
// Remove failures for tasks that succeeded
for (String task : successes) {
failures.remove(task);
}
if (failures.keySet().size() == 0) {
return null;
}
// Find the highest failure count
computeMaxFailures() ;
// Display Error Message for tasks with the highest failure count
String jtUrl = null;
try {
jtUrl = JobTrackerURLResolver.getURL(conf);
} catch (Exception e) {
console.printError("Unable to retrieve URL for Hadoop Task logs. "
+ e.getMessage());
}
String msg = null;
for (String task : failures.keySet()) {
if (failures.get(task).intValue() == maxFailures) {
TaskInfo ti = taskIdToInfo.get(task);
String jobId = ti.getJobId();
String taskUrl = (jtUrl == null) ? null :
jtUrl + "/taskdetails.jsp?jobid=" + jobId + "&tipid=" + task.toString();
TaskLogProcessor tlp = new TaskLogProcessor(conf);
for (String logUrl : ti.getLogUrls()) {
tlp.addTaskAttemptLogUrl(logUrl);
}
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.JOB_DEBUG_CAPTURE_STACKTRACES) &&
stackTraces != null) {
if (!stackTraces.containsKey(jobId)) {
stackTraces.put(jobId, new ArrayList>());
}
stackTraces.get(jobId).addAll(tlp.getStackTraces());
}
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.SHOW_JOB_FAIL_DEBUG_INFO)) {
List errors = tlp.getErrors();
StringBuilder sb = new StringBuilder();
// We use a StringBuilder and then call printError only once as
// printError will write to both stderr and the error log file. In
// situations where both the stderr and the log file output is
// simultaneously output to a single stream, this will look cleaner.
sb.append("\n");
sb.append("Task with the most failures(" + maxFailures + "): \n");
sb.append("-----\n");
sb.append("Task ID:\n " + task + "\n\n");
if (taskUrl != null) {
sb.append("URL:\n " + taskUrl + "\n");
}
for (ErrorAndSolution e : errors) {
sb.append("\n");
sb.append("Possible error:\n " + e.getError() + "\n\n");
sb.append("Solution:\n " + e.getSolution() + "\n");
}
sb.append("-----\n");
sb.append("Diagnostic Messages for this Task:\n");
String[] diagMesgs = ti.getDiagnosticMesgs();
for (String mesg : diagMesgs) {
sb.append(mesg + "\n");
}
msg = sb.toString();
console.printError(msg);
}
// Only print out one task because that's good enough for debugging.
break;
}
}
return msg;
}
public String getDiagnosticMesg() {
return diagnosticMesg;
}
public int getErrorCode() {
for (String task : failures.keySet()) {
if (failures.get(task).intValue() == maxFailures) {
TaskInfo ti = taskIdToInfo.get(task);
return ti.getErrorCode();
}
}
// Should never reach here unless there were no failed tasks.
return 0;
}
}