001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    package org.apache.hadoop.mapreduce.tools;
019    
020    import java.io.IOException;
021    import java.io.PrintWriter;
022    import java.util.ArrayList;
023    import java.util.List;
024    
025    import org.apache.commons.logging.Log;
026    import org.apache.commons.logging.LogFactory;
027    import org.apache.hadoop.classification.InterfaceAudience;
028    import org.apache.hadoop.classification.InterfaceStability;
029    import org.apache.hadoop.classification.InterfaceAudience.Private;
030    import org.apache.hadoop.conf.Configuration;
031    import org.apache.hadoop.conf.Configured;
032    import org.apache.hadoop.ipc.RemoteException;
033    import org.apache.hadoop.mapred.JobConf;
034    import org.apache.hadoop.mapred.TIPStatus;
035    import org.apache.hadoop.mapreduce.Cluster;
036    import org.apache.hadoop.mapreduce.Counters;
037    import org.apache.hadoop.mapreduce.Job;
038    import org.apache.hadoop.mapreduce.JobID;
039    import org.apache.hadoop.mapreduce.JobPriority;
040    import org.apache.hadoop.mapreduce.JobStatus;
041    import org.apache.hadoop.mapreduce.TaskAttemptID;
042    import org.apache.hadoop.mapreduce.TaskCompletionEvent;
043    import org.apache.hadoop.mapreduce.TaskReport;
044    import org.apache.hadoop.mapreduce.TaskTrackerInfo;
045    import org.apache.hadoop.mapreduce.TaskType;
046    import org.apache.hadoop.mapreduce.jobhistory.HistoryViewer;
047    import org.apache.hadoop.mapreduce.v2.LogParams;
048    import org.apache.hadoop.security.AccessControlException;
049    import org.apache.hadoop.util.ExitUtil;
050    import org.apache.hadoop.util.Tool;
051    import org.apache.hadoop.util.ToolRunner;
052    import org.apache.hadoop.yarn.logaggregation.LogDumper;
053    
054    /**
055     * Interprets the map reduce cli options 
056     */
057    @InterfaceAudience.Public
058    @InterfaceStability.Stable
059    public class CLI extends Configured implements Tool {
060      private static final Log LOG = LogFactory.getLog(CLI.class);
061      protected Cluster cluster;
062    
063      public CLI() {
064      }
065      
066      public CLI(Configuration conf) {
067        setConf(conf);
068      }
069      
070      public int run(String[] argv) throws Exception {
071        int exitCode = -1;
072        if (argv.length < 1) {
073          displayUsage("");
074          return exitCode;
075        }    
076        // process arguments
077        String cmd = argv[0];
078        String submitJobFile = null;
079        String jobid = null;
080        String taskid = null;
081        String historyFile = null;
082        String counterGroupName = null;
083        String counterName = null;
084        JobPriority jp = null;
085        String taskType = null;
086        String taskState = null;
087        int fromEvent = 0;
088        int nEvents = 0;
089        boolean getStatus = false;
090        boolean getCounter = false;
091        boolean killJob = false;
092        boolean listEvents = false;
093        boolean viewHistory = false;
094        boolean viewAllHistory = false;
095        boolean listJobs = false;
096        boolean listAllJobs = false;
097        boolean listActiveTrackers = false;
098        boolean listBlacklistedTrackers = false;
099        boolean displayTasks = false;
100        boolean killTask = false;
101        boolean failTask = false;
102        boolean setJobPriority = false;
103        boolean logs = false;
104    
105        if ("-submit".equals(cmd)) {
106          if (argv.length != 2) {
107            displayUsage(cmd);
108            return exitCode;
109          }
110          submitJobFile = argv[1];
111        } else if ("-status".equals(cmd)) {
112          if (argv.length != 2) {
113            displayUsage(cmd);
114            return exitCode;
115          }
116          jobid = argv[1];
117          getStatus = true;
118        } else if("-counter".equals(cmd)) {
119          if (argv.length != 4) {
120            displayUsage(cmd);
121            return exitCode;
122          }
123          getCounter = true;
124          jobid = argv[1];
125          counterGroupName = argv[2];
126          counterName = argv[3];
127        } else if ("-kill".equals(cmd)) {
128          if (argv.length != 2) {
129            displayUsage(cmd);
130            return exitCode;
131          }
132          jobid = argv[1];
133          killJob = true;
134        } else if ("-set-priority".equals(cmd)) {
135          if (argv.length != 3) {
136            displayUsage(cmd);
137            return exitCode;
138          }
139          jobid = argv[1];
140          try {
141            jp = JobPriority.valueOf(argv[2]); 
142          } catch (IllegalArgumentException iae) {
143            LOG.info(iae);
144            displayUsage(cmd);
145            return exitCode;
146          }
147          setJobPriority = true; 
148        } else if ("-events".equals(cmd)) {
149          if (argv.length != 4) {
150            displayUsage(cmd);
151            return exitCode;
152          }
153          jobid = argv[1];
154          fromEvent = Integer.parseInt(argv[2]);
155          nEvents = Integer.parseInt(argv[3]);
156          listEvents = true;
157        } else if ("-history".equals(cmd)) {
158          if (argv.length != 2 && !(argv.length == 3 && "all".equals(argv[1]))) {
159             displayUsage(cmd);
160             return exitCode;
161          }
162          viewHistory = true;
163          if (argv.length == 3 && "all".equals(argv[1])) {
164            viewAllHistory = true;
165            historyFile = argv[2];
166          } else {
167            historyFile = argv[1];
168          }
169        } else if ("-list".equals(cmd)) {
170          if (argv.length != 1 && !(argv.length == 2 && "all".equals(argv[1]))) {
171            displayUsage(cmd);
172            return exitCode;
173          }
174          if (argv.length == 2 && "all".equals(argv[1])) {
175            listAllJobs = true;
176          } else {
177            listJobs = true;
178          }
179        } else if("-kill-task".equals(cmd)) {
180          if (argv.length != 2) {
181            displayUsage(cmd);
182            return exitCode;
183          }
184          killTask = true;
185          taskid = argv[1];
186        } else if("-fail-task".equals(cmd)) {
187          if (argv.length != 2) {
188            displayUsage(cmd);
189            return exitCode;
190          }
191          failTask = true;
192          taskid = argv[1];
193        } else if ("-list-active-trackers".equals(cmd)) {
194          if (argv.length != 1) {
195            displayUsage(cmd);
196            return exitCode;
197          }
198          listActiveTrackers = true;
199        } else if ("-list-blacklisted-trackers".equals(cmd)) {
200          if (argv.length != 1) {
201            displayUsage(cmd);
202            return exitCode;
203          }
204          listBlacklistedTrackers = true;
205        } else if ("-list-attempt-ids".equals(cmd)) {
206          if (argv.length != 4) {
207            displayUsage(cmd);
208            return exitCode;
209          }
210          jobid = argv[1];
211          taskType = argv[2];
212          taskState = argv[3];
213          displayTasks = true;
214        } else if ("-logs".equals(cmd)) {
215          if (argv.length == 2 || argv.length ==3) {
216            logs = true;
217            jobid = argv[1];
218            if (argv.length == 3) {
219              taskid = argv[2];
220            }  else {
221              taskid = null;
222            }
223          } else {
224            displayUsage(cmd);
225            return exitCode;
226          }
227        } else {
228          displayUsage(cmd);
229          return exitCode;
230        }
231    
232        // initialize cluster
233        cluster = new Cluster(getConf());
234            
235        // Submit the request
236        try {
237          if (submitJobFile != null) {
238            Job job = Job.getInstance(new JobConf(submitJobFile));
239            job.submit();
240            System.out.println("Created job " + job.getJobID());
241            exitCode = 0;
242          } else if (getStatus) {
243            Job job = cluster.getJob(JobID.forName(jobid));
244            if (job == null) {
245              System.out.println("Could not find job " + jobid);
246            } else {
247              Counters counters = job.getCounters();
248              System.out.println();
249              System.out.println(job);
250              if (counters != null) {
251                System.out.println(counters);
252              } else {
253                System.out.println("Counters not available. Job is retired.");
254              }
255              exitCode = 0;
256            }
257          } else if (getCounter) {
258            Job job = cluster.getJob(JobID.forName(jobid));
259            if (job == null) {
260              System.out.println("Could not find job " + jobid);
261            } else {
262              Counters counters = job.getCounters();
263              if (counters == null) {
264                System.out.println("Counters not available for retired job " + 
265                jobid);
266                exitCode = -1;
267              } else {
268                System.out.println(getCounter(counters,
269                  counterGroupName, counterName));
270                exitCode = 0;
271              }
272            }
273          } else if (killJob) {
274            Job job = cluster.getJob(JobID.forName(jobid));
275            if (job == null) {
276              System.out.println("Could not find job " + jobid);
277            } else {
278              job.killJob();
279              System.out.println("Killed job " + jobid);
280              exitCode = 0;
281            }
282          } else if (setJobPriority) {
283            Job job = cluster.getJob(JobID.forName(jobid));
284            if (job == null) {
285              System.out.println("Could not find job " + jobid);
286            } else {
287              job.setPriority(jp);
288              System.out.println("Changed job priority.");
289              exitCode = 0;
290            } 
291          } else if (viewHistory) {
292            viewHistory(historyFile, viewAllHistory);
293            exitCode = 0;
294          } else if (listEvents) {
295            listEvents(cluster.getJob(JobID.forName(jobid)), fromEvent, nEvents);
296            exitCode = 0;
297          } else if (listJobs) {
298            listJobs(cluster);
299            exitCode = 0;
300          } else if (listAllJobs) {
301            listAllJobs(cluster);
302            exitCode = 0;
303          } else if (listActiveTrackers) {
304            listActiveTrackers(cluster);
305            exitCode = 0;
306          } else if (listBlacklistedTrackers) {
307            listBlacklistedTrackers(cluster);
308            exitCode = 0;
309          } else if (displayTasks) {
310            displayTasks(cluster.getJob(JobID.forName(jobid)), taskType, taskState);
311            exitCode = 0;
312          } else if(killTask) {
313            TaskAttemptID taskID = TaskAttemptID.forName(taskid);
314            Job job = cluster.getJob(taskID.getJobID());
315            if (job == null) {
316              System.out.println("Could not find job " + jobid);
317            } else if (job.killTask(taskID)) {
318              System.out.println("Killed task " + taskid);
319              exitCode = 0;
320            } else {
321              System.out.println("Could not kill task " + taskid);
322              exitCode = -1;
323            }
324          } else if(failTask) {
325            TaskAttemptID taskID = TaskAttemptID.forName(taskid);
326            Job job = cluster.getJob(taskID.getJobID());
327            if (job == null) {
328                System.out.println("Could not find job " + jobid);
329            } else if(job.failTask(taskID)) {
330              System.out.println("Killed task " + taskID + " by failing it");
331              exitCode = 0;
332            } else {
333              System.out.println("Could not fail task " + taskid);
334              exitCode = -1;
335            }
336          } else if (logs) {
337            try {
338            JobID jobID = JobID.forName(jobid);
339            TaskAttemptID taskAttemptID = TaskAttemptID.forName(taskid);
340            LogParams logParams = cluster.getLogParams(jobID, taskAttemptID);
341            LogDumper logDumper = new LogDumper();
342            logDumper.setConf(getConf());
343            exitCode = logDumper.dumpAContainersLogs(logParams.getApplicationId(),
344                logParams.getContainerId(), logParams.getNodeId(),
345                logParams.getOwner());
346            } catch (IOException e) {
347              if (e instanceof RemoteException) {
348                throw e;
349              } 
350              System.out.println(e.getMessage());
351            }
352          }
353        } catch (RemoteException re) {
354          IOException unwrappedException = re.unwrapRemoteException();
355          if (unwrappedException instanceof AccessControlException) {
356            System.out.println(unwrappedException.getMessage());
357          } else {
358            throw re;
359          }
360        } finally {
361          cluster.close();
362        }
363        return exitCode;
364      }
365    
366      private String getJobPriorityNames() {
367        StringBuffer sb = new StringBuffer();
368        for (JobPriority p : JobPriority.values()) {
369          sb.append(p.name()).append(" ");
370        }
371        return sb.substring(0, sb.length()-1);
372      }
373    
374      private String getTaskTypess() {
375        StringBuffer sb = new StringBuffer();
376        for (TaskType t : TaskType.values()) {
377          sb.append(t.name()).append(" ");
378        }
379        return sb.substring(0, sb.length()-1);
380      }
381    
382      /**
383       * Display usage of the command-line tool and terminate execution.
384       */
385      private void displayUsage(String cmd) {
386        String prefix = "Usage: CLI ";
387        String jobPriorityValues = getJobPriorityNames();
388        String taskTypes = getTaskTypess();
389        String taskStates = "running, completed";
390        if ("-submit".equals(cmd)) {
391          System.err.println(prefix + "[" + cmd + " <job-file>]");
392        } else if ("-status".equals(cmd) || "-kill".equals(cmd)) {
393          System.err.println(prefix + "[" + cmd + " <job-id>]");
394        } else if ("-counter".equals(cmd)) {
395          System.err.println(prefix + "[" + cmd + 
396            " <job-id> <group-name> <counter-name>]");
397        } else if ("-events".equals(cmd)) {
398          System.err.println(prefix + "[" + cmd + 
399            " <job-id> <from-event-#> <#-of-events>]. Event #s start from 1.");
400        } else if ("-history".equals(cmd)) {
401          System.err.println(prefix + "[" + cmd + " <jobHistoryFile>]");
402        } else if ("-list".equals(cmd)) {
403          System.err.println(prefix + "[" + cmd + " [all]]");
404        } else if ("-kill-task".equals(cmd) || "-fail-task".equals(cmd)) {
405          System.err.println(prefix + "[" + cmd + " <task-attempt-id>]");
406        } else if ("-set-priority".equals(cmd)) {
407          System.err.println(prefix + "[" + cmd + " <job-id> <priority>]. " +
408              "Valid values for priorities are: " 
409              + jobPriorityValues); 
410        } else if ("-list-active-trackers".equals(cmd)) {
411          System.err.println(prefix + "[" + cmd + "]");
412        } else if ("-list-blacklisted-trackers".equals(cmd)) {
413          System.err.println(prefix + "[" + cmd + "]");
414        } else if ("-list-attempt-ids".equals(cmd)) {
415          System.err.println(prefix + "[" + cmd + 
416              " <job-id> <task-type> <task-state>]. " +
417              "Valid values for <task-type> are " + taskTypes + ". " +
418              "Valid values for <task-state> are " + taskStates);
419        } else if ("-logs".equals(cmd)) {
420          System.err.println(prefix + "[" + cmd +
421              " <job-id> <task-attempt-id>]. " +
422              " <task-attempt-id> is optional to get task attempt logs.");      
423        } else {
424          System.err.printf(prefix + "<command> <args>\n");
425          System.err.printf("\t[-submit <job-file>]\n");
426          System.err.printf("\t[-status <job-id>]\n");
427          System.err.printf("\t[-counter <job-id> <group-name> <counter-name>]\n");
428          System.err.printf("\t[-kill <job-id>]\n");
429          System.err.printf("\t[-set-priority <job-id> <priority>]. " +
430            "Valid values for priorities are: " + jobPriorityValues + "\n");
431          System.err.printf("\t[-events <job-id> <from-event-#> <#-of-events>]\n");
432          System.err.printf("\t[-history <jobHistoryFile>]\n");
433          System.err.printf("\t[-list [all]]\n");
434          System.err.printf("\t[-list-active-trackers]\n");
435          System.err.printf("\t[-list-blacklisted-trackers]\n");
436          System.err.println("\t[-list-attempt-ids <job-id> <task-type> " +
437            "<task-state>]. " +
438            "Valid values for <task-type> are " + taskTypes + ". " +
439            "Valid values for <task-state> are " + taskStates);
440          System.err.printf("\t[-kill-task <task-attempt-id>]\n");
441          System.err.printf("\t[-fail-task <task-attempt-id>]\n");
442          System.err.printf("\t[-logs <job-id> <task-attempt-id>]\n\n");
443          ToolRunner.printGenericCommandUsage(System.out);
444        }
445      }
446        
447      private void viewHistory(String historyFile, boolean all) 
448        throws IOException {
449        HistoryViewer historyViewer = new HistoryViewer(historyFile,
450                                            getConf(), all);
451        historyViewer.print();
452      }
453    
454      protected long getCounter(Counters counters, String counterGroupName,
455          String counterName) throws IOException {
456        return counters.findCounter(counterGroupName, counterName).getValue();
457      }
458      
459      /**
460       * List the events for the given job
461       * @param jobId the job id for the job's events to list
462       * @throws IOException
463       */
464      private void listEvents(Job job, int fromEventId, int numEvents)
465          throws IOException, InterruptedException {
466        TaskCompletionEvent[] events = job.
467          getTaskCompletionEvents(fromEventId, numEvents);
468        System.out.println("Task completion events for " + job.getJobID());
469        System.out.println("Number of events (from " + fromEventId + ") are: " 
470          + events.length);
471        for(TaskCompletionEvent event: events) {
472          System.out.println(event.getStatus() + " " + 
473            event.getTaskAttemptId() + " " + 
474            getTaskLogURL(event.getTaskAttemptId(), event.getTaskTrackerHttp()));
475        }
476      }
477    
478      protected static String getTaskLogURL(TaskAttemptID taskId, String baseUrl) {
479        return (baseUrl + "/tasklog?plaintext=true&attemptid=" + taskId); 
480      }
481      
482    
483      /**
484       * Dump a list of currently running jobs
485       * @throws IOException
486       */
487      private void listJobs(Cluster cluster) 
488          throws IOException, InterruptedException {
489        List<JobStatus> runningJobs = new ArrayList<JobStatus>();
490        for (JobStatus job : cluster.getAllJobStatuses()) {
491          if (!job.isJobComplete()) {
492            runningJobs.add(job);
493          }
494        }
495        displayJobList(runningJobs.toArray(new JobStatus[0]));
496      }
497        
498      /**
499       * Dump a list of all jobs submitted.
500       * @throws IOException
501       */
502      private void listAllJobs(Cluster cluster) 
503          throws IOException, InterruptedException {
504        displayJobList(cluster.getAllJobStatuses());
505      }
506      
507      /**
508       * Display the list of active trackers
509       */
510      private void listActiveTrackers(Cluster cluster) 
511          throws IOException, InterruptedException {
512        TaskTrackerInfo[] trackers = cluster.getActiveTaskTrackers();
513        for (TaskTrackerInfo tracker : trackers) {
514          System.out.println(tracker.getTaskTrackerName());
515        }
516      }
517    
518      /**
519       * Display the list of blacklisted trackers
520       */
521      private void listBlacklistedTrackers(Cluster cluster) 
522          throws IOException, InterruptedException {
523        TaskTrackerInfo[] trackers = cluster.getBlackListedTaskTrackers();
524        if (trackers.length > 0) {
525          System.out.println("BlackListedNode \t Reason");
526        }
527        for (TaskTrackerInfo tracker : trackers) {
528          System.out.println(tracker.getTaskTrackerName() + "\t" + 
529            tracker.getReasonForBlacklist());
530        }
531      }
532    
533      private void printTaskAttempts(TaskReport report) {
534        if (report.getCurrentStatus() == TIPStatus.COMPLETE) {
535          System.out.println(report.getSuccessfulTaskAttemptId());
536        } else if (report.getCurrentStatus() == TIPStatus.RUNNING) {
537          for (TaskAttemptID t : 
538            report.getRunningTaskAttemptIds()) {
539            System.out.println(t);
540          }
541        }
542      }
543    
544      /**
545       * Display the information about a job's tasks, of a particular type and
546       * in a particular state
547       * 
548       * @param job the job
549       * @param type the type of the task (map/reduce/setup/cleanup)
550       * @param state the state of the task 
551       * (pending/running/completed/failed/killed)
552       */
553      protected void displayTasks(Job job, String type, String state) 
554      throws IOException, InterruptedException {
555        TaskReport[] reports = job.getTaskReports(TaskType.valueOf(type));
556        for (TaskReport report : reports) {
557          TIPStatus status = report.getCurrentStatus();
558          if ((state.equals("pending") && status ==TIPStatus.PENDING) ||
559              (state.equals("running") && status ==TIPStatus.RUNNING) ||
560              (state.equals("completed") && status == TIPStatus.COMPLETE) ||
561              (state.equals("failed") && status == TIPStatus.FAILED) ||
562              (state.equals("killed") && status == TIPStatus.KILLED)) {
563            printTaskAttempts(report);
564          }
565        }
566      }
567    
568      public void displayJobList(JobStatus[] jobs) 
569          throws IOException, InterruptedException {
570        displayJobList(jobs, new PrintWriter(System.out));
571      }
572    
573      @Private
574      public static String headerPattern = "%23s\t%10s\t%14s\t%12s\t%12s\t%10s\t%15s\t%15s\t%8s\t%8s\t%10s\t%10s\n";
575      @Private
576      public static String dataPattern   = "%23s\t%10s\t%14d\t%12s\t%12s\t%10s\t%15s\t%15s\t%8s\t%8s\t%10s\t%10s\n";
577      private static String memPattern   = "%dM";
578      private static String UNAVAILABLE  = "N/A";
579    
580      @Private
581      public void displayJobList(JobStatus[] jobs, PrintWriter writer) {
582        writer.println("Total jobs:" + jobs.length);
583        writer.printf(headerPattern, "JobId", "State", "StartTime", "UserName",
584          "Queue", "Priority", "UsedContainers",
585          "RsvdContainers", "UsedMem", "RsvdMem", "NeededMem", "AM info");
586        for (JobStatus job : jobs) {
587          int numUsedSlots = job.getNumUsedSlots();
588          int numReservedSlots = job.getNumReservedSlots();
589          int usedMem = job.getUsedMem();
590          int rsvdMem = job.getReservedMem();
591          int neededMem = job.getNeededMem();
592          writer.printf(dataPattern,
593              job.getJobID().toString(), job.getState(), job.getStartTime(),
594              job.getUsername(), job.getQueue(), 
595              job.getPriority().name(),
596              numUsedSlots < 0 ? UNAVAILABLE : numUsedSlots,
597              numReservedSlots < 0 ? UNAVAILABLE : numReservedSlots,
598              usedMem < 0 ? UNAVAILABLE : String.format(memPattern, usedMem),
599              rsvdMem < 0 ? UNAVAILABLE : String.format(memPattern, rsvdMem),
600              neededMem < 0 ? UNAVAILABLE : String.format(memPattern, neededMem),
601              job.getSchedulingInfo());
602        }
603        writer.flush();
604      }
605      
606      public static void main(String[] argv) throws Exception {
607        int res = ToolRunner.run(new CLI(), argv);
608        ExitUtil.terminate(res);
609      }
610    }