001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.yarn.applications.distributedshell;
020    
021    import java.io.BufferedReader;
022    import java.io.IOException;
023    import java.io.InputStreamReader;
024    import java.net.InetSocketAddress;
025    import java.net.URI;
026    import java.net.URISyntaxException;
027    import java.util.ArrayList;
028    import java.util.HashMap;
029    import java.util.List;
030    import java.util.Map;
031    import java.util.Vector;
032    import java.util.concurrent.CopyOnWriteArrayList;
033    import java.util.concurrent.atomic.AtomicInteger;
034    
035    import org.apache.commons.cli.CommandLine;
036    import org.apache.commons.cli.GnuParser;
037    import org.apache.commons.cli.HelpFormatter;
038    import org.apache.commons.cli.Options;
039    import org.apache.commons.cli.ParseException;
040    import org.apache.commons.logging.Log;
041    import org.apache.commons.logging.LogFactory;
042    
043    import org.apache.hadoop.classification.InterfaceAudience;
044    import org.apache.hadoop.classification.InterfaceStability;
045    import org.apache.hadoop.conf.Configuration;
046    import org.apache.hadoop.net.NetUtils;
047    import org.apache.hadoop.yarn.api.AMRMProtocol;
048    import org.apache.hadoop.yarn.api.ApplicationConstants;
049    import org.apache.hadoop.yarn.api.ContainerManager;
050    
051    import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
052    import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
053    import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
054    //import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest;
055    //import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusResponse;
056    import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest;
057    import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
058    import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
059    
060    import org.apache.hadoop.yarn.api.records.AMResponse;
061    import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
062    import org.apache.hadoop.yarn.api.records.Container;
063    import org.apache.hadoop.yarn.api.records.ContainerId;
064    import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
065    import org.apache.hadoop.yarn.api.records.ContainerState;
066    import org.apache.hadoop.yarn.api.records.ContainerStatus;
067    import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
068    import org.apache.hadoop.yarn.api.records.LocalResource;
069    import org.apache.hadoop.yarn.api.records.LocalResourceType;
070    import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
071    import org.apache.hadoop.yarn.api.records.Priority;
072    import org.apache.hadoop.yarn.api.records.Resource;
073    import org.apache.hadoop.yarn.api.records.ResourceRequest;
074    import org.apache.hadoop.yarn.conf.YarnConfiguration;
075    import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
076    import org.apache.hadoop.yarn.ipc.YarnRPC;
077    import org.apache.hadoop.yarn.util.ConverterUtils;
078    import org.apache.hadoop.yarn.util.Records;
079    
080    /**
081     * An ApplicationMaster for executing shell commands on a set of launched containers using the YARN framework. 
082     * 
083     * <p>This class is meant to act as an example on how to write yarn-based application masters. </p>
084     * 
085     * <p> The ApplicationMaster is started on a container by the <code>ResourceManager</code>'s launcher. 
086     * The first thing that the <code>ApplicationMaster</code> needs to do is to connect and register itself with 
087     * the <code>ResourceManager</code>. The registration sets up information within the <code>ResourceManager</code>
088     * regarding what host:port the ApplicationMaster is listening on to provide any form of functionality to a client
089     * as well as a tracking url that a client can use to keep track of status/job history if needed. </p>
090     * 
 * <p> The <code>ApplicationMaster</code> needs to send a heartbeat to the <code>ResourceManager</code> at regular intervals
 * to inform the <code>ResourceManager</code> that it is up and alive. The {@link AMRMProtocol#allocate} call to the 
 * <code>ResourceManager</code> from the <code>ApplicationMaster</code> acts as a heartbeat. </p>
094     * 
095     * <p> For the actual handling of the job, the <code>ApplicationMaster</code> has to request the 
096     * <code>ResourceManager</code> via {@link AllocateRequest} for the required no. of containers using {@link ResourceRequest}
097     * with the necessary resource specifications such as node location, computational (memory/disk/cpu) resource requirements.
098     * The <code>ResourceManager</code> responds with an {@link AllocateResponse} that informs the <code>ApplicationMaster</code> 
099     * of the set of newly allocated containers, completed containers as well as current state of available resources. </p>
100     * 
101     * <p> For each allocated container, the <code>ApplicationMaster</code> can then set up the necessary launch context via 
102     * {@link ContainerLaunchContext} to specify the allocated container id, local resources required by the executable, 
103     * the environment to be setup for the executable, commands to execute, etc. and submit a {@link StartContainerRequest} 
104     * to the {@link ContainerManager} to launch and execute the defined commands on the given allocated container. </p>
105     *  
 * <p> The <code>ApplicationMaster</code> can monitor the launched container by either querying the <code>ResourceManager</code> 
 * using {@link AMRMProtocol#allocate} to get updates on completed containers or via the {@link ContainerManager} 
 * by querying for the status of the allocated container's {@link ContainerId}. </p>
 * 
 * <p> After the job has been completed, the <code>ApplicationMaster</code> has to send a {@link FinishApplicationMasterRequest} 
 * to the <code>ResourceManager</code> to inform it that the <code>ApplicationMaster</code> has completed. </p>
112     */
113    @InterfaceAudience.Public
114    @InterfaceStability.Unstable
115    public class ApplicationMaster {
116    
117      private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
118    
119      // Configuration 
120      private Configuration conf;
121      // YARN RPC to communicate with the Resource Manager or Node Manager
122      private YarnRPC rpc;
123    
124      // Handle to communicate with the Resource Manager
125      private AMRMProtocol resourceManager;
126    
127      // Application Attempt Id ( combination of attemptId and fail count )
128      private ApplicationAttemptId appAttemptID;
129    
130      // TODO
131      // For status update for clients - yet to be implemented
132      // Hostname of the container 
133      private String appMasterHostname = "";
134      // Port on which the app master listens for status update requests from clients
135      private int appMasterRpcPort = 0;
136      // Tracking url to which app master publishes info for clients to monitor 
137      private String appMasterTrackingUrl = "";
138    
139      // App Master configuration
140      // No. of containers to run shell command on
141      private int numTotalContainers = 1;
142      // Memory to request for the container on which the shell command will run 
143      private int containerMemory = 10;
144      // Priority of the request
145      private int requestPriority; 
146    
147      // Incremental counter for rpc calls to the RM
148      private AtomicInteger rmRequestID = new AtomicInteger();
149    
150      // Simple flag to denote whether all works is done
151      private boolean appDone = false; 
152      // Counter for completed containers ( complete denotes successful or failed )
153      private AtomicInteger numCompletedContainers = new AtomicInteger();
154      // Allocated container count so that we know how many containers has the RM
155      // allocated to us
156      private AtomicInteger numAllocatedContainers = new AtomicInteger();
157      // Count of failed containers 
158      private AtomicInteger numFailedContainers = new AtomicInteger();
159      // Count of containers already requested from the RM
160      // Needed as once requested, we should not request for containers again and again. 
161      // Only request for more if the original requirement changes. 
162      private AtomicInteger numRequestedContainers = new AtomicInteger();
163    
164      // Shell command to be executed 
165      private String shellCommand = ""; 
166      // Args to be passed to the shell command
167      private String shellArgs = "";
168      // Env variables to be setup for the shell command 
169      private Map<String, String> shellEnv = new HashMap<String, String>();
170    
171      // Location of shell script ( obtained from info set in env )
172      // Shell script path in fs
173      private String shellScriptPath = ""; 
174      // Timestamp needed for creating a local resource
175      private long shellScriptPathTimestamp = 0;
176      // File length needed for local resource
177      private long shellScriptPathLen = 0;
178    
179      // Hardcoded path to shell script in launch container's local env
180      private final String ExecShellStringPath = "ExecShellScript.sh";
181    
182      // Containers to be released
183      private CopyOnWriteArrayList<ContainerId> releasedContainers = new CopyOnWriteArrayList<ContainerId>();
184    
185      // Launch threads
186      private List<Thread> launchThreads = new ArrayList<Thread>();
187    
188      /**
189       * @param args Command line args
190       */
191      public static void main(String[] args) {
192        boolean result = false;
193        try {
194          ApplicationMaster appMaster = new ApplicationMaster();
195          LOG.info("Initializing ApplicationMaster");
196          boolean doRun = appMaster.init(args);
197          if (!doRun) {
198            System.exit(0);
199          }
200          result = appMaster.run();
201        } catch (Throwable t) {
202          LOG.fatal("Error running ApplicationMaster", t);
203          System.exit(1);
204        }
205        if (result) {
206          LOG.info("Application Master completed successfully. exiting");
207          System.exit(0);
208        }
209        else {
210          LOG.info("Application Master failed. exiting");
211          System.exit(2);
212        }
213      }
214    
215      /**
216       * Dump out contents of $CWD and the environment to stdout for debugging
217       */
218      private void dumpOutDebugInfo() {
219    
220        LOG.info("Dump debug output");
221        Map<String, String> envs = System.getenv();
222        for (Map.Entry<String, String> env : envs.entrySet()) {
223          LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue());
224          System.out.println("System env: key=" + env.getKey() + ", val=" + env.getValue());
225        }
226    
227        String cmd = "ls -al";
228        Runtime run = Runtime.getRuntime();
229        Process pr = null;
230        try {
231          pr = run.exec(cmd);
232          pr.waitFor();
233    
234          BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream()));
235          String line = "";
236          while ((line=buf.readLine())!=null) {
237            LOG.info("System CWD content: " + line);
238            System.out.println("System CWD content: " + line);
239          }
240          buf.close();
241        } catch (IOException e) {
242          e.printStackTrace();
243        } catch (InterruptedException e) {
244          e.printStackTrace();
245        } 
246      }
247    
248      public ApplicationMaster() throws Exception {
249        // Set up the configuration and RPC
250        conf = new Configuration();
251        rpc = YarnRPC.create(conf);
252      }
253      /**
254       * Parse command line options
255       * @param args Command line args 
256       * @return Whether init successful and run should be invoked 
257       * @throws ParseException
258       * @throws IOException 
259       */
260      public boolean init(String[] args) throws ParseException, IOException {
261    
262        Options opts = new Options();
263        opts.addOption("app_attempt_id", true, "App Attempt ID. Not to be used unless for testing purposes");
264        opts.addOption("shell_command", true, "Shell command to be executed by the Application Master");
265        opts.addOption("shell_script", true, "Location of the shell script to be executed");
266        opts.addOption("shell_args", true, "Command line args for the shell script");
267        opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs");
268        opts.addOption("container_memory", true, "Amount of memory in MB to be requested to run the shell command");
269        opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed");
270        opts.addOption("priority", true, "Application Priority. Default 0");
271        opts.addOption("debug", false, "Dump out debug information");
272    
273        opts.addOption("help", false, "Print usage");
274        CommandLine cliParser = new GnuParser().parse(opts, args);
275    
276        if (args.length == 0) {
277          printUsage(opts);
278          throw new IllegalArgumentException("No args specified for application master to initialize");
279        }
280    
281        if (cliParser.hasOption("help")) {
282          printUsage(opts);
283          return false;
284        }
285    
286        if (cliParser.hasOption("debug")) {
287          dumpOutDebugInfo();
288        }
289    
290        Map<String, String> envs = System.getenv();
291    
292        appAttemptID = Records.newRecord(ApplicationAttemptId.class);
293        if (envs.containsKey(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV)) {
294          appAttemptID = ConverterUtils.toApplicationAttemptId(envs
295              .get(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV));
296        } else if (!envs.containsKey(ApplicationConstants.AM_CONTAINER_ID_ENV)) {
297          if (cliParser.hasOption("app_attempt_id")) {
298            String appIdStr = cliParser.getOptionValue("app_attempt_id", "");
299            appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr);
300          } 
301          else {
302            throw new IllegalArgumentException("Application Attempt Id not set in the environment");
303          }
304        } else {
305          ContainerId containerId = ConverterUtils.toContainerId(envs.get(ApplicationConstants.AM_CONTAINER_ID_ENV));
306          appAttemptID = containerId.getApplicationAttemptId();
307        }
308    
309        LOG.info("Application master for app"
310            + ", appId=" + appAttemptID.getApplicationId().getId()
311            + ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp()
312            + ", attemptId=" + appAttemptID.getAttemptId());
313    
314        if (!cliParser.hasOption("shell_command")) {
315          throw new IllegalArgumentException("No shell command specified to be executed by application master");
316        }
317        shellCommand = cliParser.getOptionValue("shell_command");
318    
319        if (cliParser.hasOption("shell_args")) {
320          shellArgs = cliParser.getOptionValue("shell_args");
321        }
322        if (cliParser.hasOption("shell_env")) { 
323          String shellEnvs[] = cliParser.getOptionValues("shell_env");
324          for (String env : shellEnvs) {
325            env = env.trim();
326            int index = env.indexOf('=');
327            if (index == -1) {
328              shellEnv.put(env, "");
329              continue;
330            }
331            String key = env.substring(0, index);
332            String val = "";
333            if (index < (env.length()-1)) {
334              val = env.substring(index+1);
335            }
336            shellEnv.put(key, val);
337          }
338        }
339    
340        if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION)) {
341          shellScriptPath = envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION);
342    
343          if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)) {
344            shellScriptPathTimestamp = Long.valueOf(envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP));
345          }
346          if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)) {
347            shellScriptPathLen = Long.valueOf(envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN));
348          }
349    
350          if (!shellScriptPath.isEmpty()
351              && (shellScriptPathTimestamp <= 0 
352              || shellScriptPathLen <= 0)) {
353            LOG.error("Illegal values in env for shell script path"
354                + ", path=" + shellScriptPath
355                + ", len=" + shellScriptPathLen
356                + ", timestamp=" + shellScriptPathTimestamp);
357            throw new IllegalArgumentException("Illegal values in env for shell script path");
358          }
359        }
360    
361        containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "10"));
362        numTotalContainers = Integer.parseInt(cliParser.getOptionValue("num_containers", "1"));
363        requestPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0"));
364    
365        return true;
366      }
367    
368      /**
369       * Helper function to print usage 
370       * @param opts Parsed command line options
371       */
372      private void printUsage(Options opts) {
373        new HelpFormatter().printHelp("ApplicationMaster", opts);
374      }
375    
376      /**
377       * Main run function for the application master
378       * @throws YarnRemoteException
379       */
380      public boolean run() throws YarnRemoteException {
381        LOG.info("Starting ApplicationMaster");
382    
383        // Connect to ResourceManager
384        resourceManager = connectToRM();
385    
386        // Setup local RPC Server to accept status requests directly from clients 
387        // TODO need to setup a protocol for client to be able to communicate to the RPC server 
388        // TODO use the rpc port info to register with the RM for the client to send requests to this app master
389    
390        // Register self with ResourceManager 
391        RegisterApplicationMasterResponse response = registerToRM();
392        // Dump out information about cluster capability as seen by the resource manager
393        int minMem = response.getMinimumResourceCapability().getMemory();
394        int maxMem = response.getMaximumResourceCapability().getMemory();
395        LOG.info("Min mem capabililty of resources in this cluster " + minMem);
396        LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
397    
398        // A resource ask has to be atleast the minimum of the capability of the cluster, the value has to be 
399        // a multiple of the min value and cannot exceed the max. 
400        // If it is not an exact multiple of min, the RM will allocate to the nearest multiple of min
401        if (containerMemory < minMem) {
402          LOG.info("Container memory specified below min threshold of cluster. Using min value."
403              + ", specified=" + containerMemory
404              + ", min=" + minMem);
405          containerMemory = minMem; 
406        } 
407        else if (containerMemory > maxMem) {
408          LOG.info("Container memory specified above max threshold of cluster. Using max value."
409              + ", specified=" + containerMemory
410              + ", max=" + maxMem);
411          containerMemory = maxMem;
412        }
413    
414        // Setup heartbeat emitter
415        // TODO poll RM every now and then with an empty request to let RM know that we are alive
416        // The heartbeat interval after which an AM is timed out by the RM is defined by a config setting: 
417        // RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
418        // The allocate calls to the RM count as heartbeats so, for now, this additional heartbeat emitter 
419        // is not required.
420    
421        // Setup ask for containers from RM
422        // Send request for containers to RM
423        // Until we get our fully allocated quota, we keep on polling RM for containers
424        // Keep looping until all the containers are launched and shell script executed on them 
425        // ( regardless of success/failure). 
426    
427        int loopCounter = -1;
428    
429        while (numCompletedContainers.get() < numTotalContainers
430            && !appDone) {
431          loopCounter++;
432    
433          // log current state
434          LOG.info("Current application state: loop=" + loopCounter 
435              + ", appDone=" + appDone
436              + ", total=" + numTotalContainers
437              + ", requested=" + numRequestedContainers
438              + ", completed=" + numCompletedContainers
439              + ", failed=" + numFailedContainers
440              + ", currentAllocated=" + numAllocatedContainers);
441    
442          // Sleep before each loop when asking RM for containers
443          // to avoid flooding RM with spurious requests when it 
444          // need not have any available containers 
445          // Sleeping for 1000 ms.
446          try {
447            Thread.sleep(1000);
448          } catch (InterruptedException e) {
449            LOG.info("Sleep interrupted " + e.getMessage());
450          }
451    
452          // No. of containers to request 
453          // For the first loop, askCount will be equal to total containers needed 
454          // From that point on, askCount will always be 0 as current implementation 
455          // does not change its ask on container failures. 
456          int askCount = numTotalContainers - numRequestedContainers.get();
457          numRequestedContainers.addAndGet(askCount);
458    
459          // Setup request to be sent to RM to allocate containers
460          List<ResourceRequest> resourceReq = new ArrayList<ResourceRequest>();
461          if (askCount > 0) {
462            ResourceRequest containerAsk = setupContainerAskForRM(askCount);
463            resourceReq.add(containerAsk);
464          }
465    
466          // Send the request to RM 
467          LOG.info("Asking RM for containers"
468              + ", askCount=" + askCount);
469          AMResponse amResp =sendContainerAskToRM(resourceReq);
470    
471          // Retrieve list of allocated containers from the response 
472          List<Container> allocatedContainers = amResp.getAllocatedContainers();
473          LOG.info("Got response from RM for container ask, allocatedCnt=" + allocatedContainers.size());
474          numAllocatedContainers.addAndGet(allocatedContainers.size());
475          for (Container allocatedContainer : allocatedContainers) {
476            LOG.info("Launching shell command on a new container."
477                + ", containerId=" + allocatedContainer.getId()
478                + ", containerNode=" + allocatedContainer.getNodeId().getHost() 
479                + ":" + allocatedContainer.getNodeId().getPort()
480                + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress()
481                + ", containerState" + allocatedContainer.getState()
482                + ", containerResourceMemory" + allocatedContainer.getResource().getMemory());
483            //+ ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString());
484    
485            LaunchContainerRunnable runnableLaunchContainer = new LaunchContainerRunnable(allocatedContainer);
486            Thread launchThread = new Thread(runnableLaunchContainer);
487    
488            // launch and start the container on a separate thread to keep the main thread unblocked
489            // as all containers may not be allocated at one go.
490            launchThreads.add(launchThread);
491            launchThread.start();
492          }
493    
494          // Check what the current available resources in the cluster are
495          // TODO should we do anything if the available resources are not enough? 
496          Resource availableResources = amResp.getAvailableResources();
497          LOG.info("Current available resources in the cluster " + availableResources);
498    
499          // Check the completed containers
500          List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses();
501          LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size());
502          for (ContainerStatus containerStatus : completedContainers) {
503            LOG.info("Got container status for containerID= " + containerStatus.getContainerId()
504                + ", state=" + containerStatus.getState()
505                + ", exitStatus=" + containerStatus.getExitStatus() 
506                + ", diagnostics=" + containerStatus.getDiagnostics());
507    
508            // non complete containers should not be here 
509            assert(containerStatus.getState() == ContainerState.COMPLETE);
510    
511            // increment counters for completed/failed containers
512            int exitStatus = containerStatus.getExitStatus();
513            if (0 != exitStatus) {
514              // container failed 
515              if (-100 != exitStatus) {
516                // shell script failed
517                // counts as completed 
518                numCompletedContainers.incrementAndGet();
519                numFailedContainers.incrementAndGet();
520              }
521              else { 
522                // something else bad happened 
523                // app job did not complete for some reason 
524                // we should re-try as the container was lost for some reason
525                numAllocatedContainers.decrementAndGet();
526                numRequestedContainers.decrementAndGet();
527                // we do not need to release the container as it would be done
528                // by the RM/CM.
529              }
530            }
531            else { 
532              // nothing to do 
533              // container completed successfully 
534              numCompletedContainers.incrementAndGet();
535              LOG.info("Container completed successfully."
536                  + ", containerId=" + containerStatus.getContainerId());
537            }
538    
539          }
540          if (numCompletedContainers.get() == numTotalContainers) {
541            appDone = true;
542          }
543    
544          LOG.info("Current application state: loop=" + loopCounter
545              + ", appDone=" + appDone
546              + ", total=" + numTotalContainers
547              + ", requested=" + numRequestedContainers
548              + ", completed=" + numCompletedContainers
549              + ", failed=" + numFailedContainers
550              + ", currentAllocated=" + numAllocatedContainers);
551    
552          // TODO 
553          // Add a timeout handling layer 
554          // for misbehaving shell commands
555        }
556    
557        // Join all launched threads
558        // needed for when we time out 
559        // and we need to release containers
560        for (Thread launchThread : launchThreads) {
561          try {
562            launchThread.join(10000);
563          } catch (InterruptedException e) {
564            LOG.info("Exception thrown in thread join: " + e.getMessage());
565            e.printStackTrace();
566          }
567        }
568    
569        // When the application completes, it should send a finish application signal 
570        // to the RM
571        LOG.info("Application completed. Signalling finish to RM");
572    
573        FinishApplicationMasterRequest finishReq = Records.newRecord(FinishApplicationMasterRequest.class);
574        finishReq.setAppAttemptId(appAttemptID);
575        boolean isSuccess = true;
576        if (numFailedContainers.get() == 0) {
577          finishReq.setFinishApplicationStatus(FinalApplicationStatus.SUCCEEDED);
578        }
579        else {
580          finishReq.setFinishApplicationStatus(FinalApplicationStatus.FAILED);
581          String diagnostics = "Diagnostics."
582              + ", total=" + numTotalContainers
583              + ", completed=" + numCompletedContainers.get()
584              + ", allocated=" + numAllocatedContainers.get()
585              + ", failed=" + numFailedContainers.get();
586          finishReq.setDiagnostics(diagnostics);
587          isSuccess = false;
588        }
589        resourceManager.finishApplicationMaster(finishReq);
590        return isSuccess;
591      }
592    
593      /**
594       * Thread to connect to the {@link ContainerManager} and 
595       * launch the container that will execute the shell command. 
596       */
597      private class LaunchContainerRunnable implements Runnable {
598    
599        // Allocated container 
600        Container container;
601        // Handle to communicate with ContainerManager
602        ContainerManager cm;
603    
604        /**
605         * @param lcontainer Allocated container
606         */
607        public LaunchContainerRunnable(Container lcontainer) {
608          this.container = lcontainer;
609        }
610    
611        /**
612         * Helper function to connect to CM
613         */
614        private void connectToCM() {
615          LOG.debug("Connecting to ContainerManager for containerid=" + container.getId());
616          String cmIpPortStr = container.getNodeId().getHost() + ":"
617              + container.getNodeId().getPort();
618          InetSocketAddress cmAddress = NetUtils.createSocketAddr(cmIpPortStr);
619          LOG.info("Connecting to ContainerManager at " + cmIpPortStr);
620          this.cm = ((ContainerManager) rpc.getProxy(ContainerManager.class, cmAddress, conf));
621        }
622    
623    
624        @Override
625        /**
626         * Connects to CM, sets up container launch context 
627         * for shell command and eventually dispatches the container 
628         * start request to the CM. 
629         */
630        public void run() {
631          // Connect to ContainerManager 
632          connectToCM();
633    
634          LOG.info("Setting up container launch container for containerid=" + container.getId());
635          ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class);
636    
637          ctx.setContainerId(container.getId());
638          ctx.setResource(container.getResource());
639    
640          String jobUserName = System.getenv(ApplicationConstants.Environment.USER
641              .name());
642          ctx.setUser(jobUserName);
643          LOG.info("Setting user in ContainerLaunchContext to: " + jobUserName);
644    
645          // Set the environment 
646          ctx.setEnvironment(shellEnv);
647    
648          // Set the local resources 
649          Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
650    
651          // The container for the eventual shell commands needs its own local resources too. 
652          // In this scenario, if a shell script is specified, we need to have it copied 
653          // and made available to the container. 
654          if (!shellScriptPath.isEmpty()) {
655            LocalResource shellRsrc = Records.newRecord(LocalResource.class);
656            shellRsrc.setType(LocalResourceType.FILE);
657            shellRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
658            try {
659              shellRsrc.setResource(ConverterUtils.getYarnUrlFromURI(new URI(shellScriptPath)));
660            } catch (URISyntaxException e) {
661              LOG.error("Error when trying to use shell script path specified in env"
662                  + ", path=" + shellScriptPath);
663              e.printStackTrace();
664    
665              // A failure scenario on bad input such as invalid shell script path 
666              // We know we cannot continue launching the container 
667              // so we should release it.
668              // TODO
669              numCompletedContainers.incrementAndGet();
670              numFailedContainers.incrementAndGet();
671              return;
672            }
673            shellRsrc.setTimestamp(shellScriptPathTimestamp);
674            shellRsrc.setSize(shellScriptPathLen);
675            localResources.put(ExecShellStringPath, shellRsrc);
676          }
677          ctx.setLocalResources(localResources);
678    
679          // Set the necessary command to execute on the allocated container 
680          Vector<CharSequence> vargs = new Vector<CharSequence>(5);
681    
682          // Set executable command 
683          vargs.add(shellCommand);
684          // Set shell script path 
685          if (!shellScriptPath.isEmpty()) {
686            vargs.add(ExecShellStringPath);
687          }
688    
689          // Set args for the shell command if any
690          vargs.add(shellArgs);
691          // Add log redirect params
692          // TODO
693          // We should redirect the output to hdfs instead of local logs 
694          // so as to be able to look at the final output after the containers 
695          // have been released. 
696          // Could use a path suffixed with /AppId/AppAttempId/ContainerId/std[out|err] 
697          vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");
698          vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");
699    
700          // Get final commmand
701          StringBuilder command = new StringBuilder();
702          for (CharSequence str : vargs) {
703            command.append(str).append(" ");
704          }
705    
706          List<String> commands = new ArrayList<String>();
707          commands.add(command.toString());
708          ctx.setCommands(commands);
709    
710          StartContainerRequest startReq = Records.newRecord(StartContainerRequest.class);
711          startReq.setContainerLaunchContext(ctx);
712          try {
713            cm.startContainer(startReq);
714          } catch (YarnRemoteException e) {
715            LOG.info("Start container failed for :"
716                + ", containerId=" + container.getId());
717            e.printStackTrace();
718            // TODO do we need to release this container? 
719          }
720    
721          // Get container status?
722          // Left commented out as the shell scripts are short lived 
723          // and we are relying on the status for completed containers from RM to detect status
724    
725          //    GetContainerStatusRequest statusReq = Records.newRecord(GetContainerStatusRequest.class);
726          //    statusReq.setContainerId(container.getId());
727          //    GetContainerStatusResponse statusResp;
728          //try {
729          //statusResp = cm.getContainerStatus(statusReq);
730          //    LOG.info("Container Status"
731          //    + ", id=" + container.getId()
732          //    + ", status=" +statusResp.getStatus());
733          //} catch (YarnRemoteException e) {
734          //e.printStackTrace();
735          //}
736        }
737      }
738    
739      /**
740       * Connect to the Resource Manager
741       * @return Handle to communicate with the RM
742       */
743      private AMRMProtocol connectToRM() {
744        YarnConfiguration yarnConf = new YarnConfiguration(conf);
745        InetSocketAddress rmAddress = yarnConf.getSocketAddr(
746            YarnConfiguration.RM_SCHEDULER_ADDRESS,
747            YarnConfiguration.DEFAULT_RM_SCHEDULER_ADDRESS,
748            YarnConfiguration.DEFAULT_RM_SCHEDULER_PORT);
749        LOG.info("Connecting to ResourceManager at " + rmAddress);
750        return ((AMRMProtocol) rpc.getProxy(AMRMProtocol.class, rmAddress, conf));
751      }
752    
753      /** 
754       * Register the Application Master to the Resource Manager
755       * @return the registration response from the RM
756       * @throws YarnRemoteException
757       */
758      private RegisterApplicationMasterResponse registerToRM() throws YarnRemoteException {
759        RegisterApplicationMasterRequest appMasterRequest = Records.newRecord(RegisterApplicationMasterRequest.class);
760    
761        // set the required info into the registration request: 
762        // application attempt id, 
763        // host on which the app master is running
764        // rpc port on which the app master accepts requests from the client 
765        // tracking url for the app master
766        appMasterRequest.setApplicationAttemptId(appAttemptID);
767        appMasterRequest.setHost(appMasterHostname);
768        appMasterRequest.setRpcPort(appMasterRpcPort);
769        appMasterRequest.setTrackingUrl(appMasterTrackingUrl);
770    
771        return resourceManager.registerApplicationMaster(appMasterRequest);
772      }
773    
774      /**
775       * Setup the request that will be sent to the RM for the container ask.
776       * @param numContainers Containers to ask for from RM
777       * @return the setup ResourceRequest to be sent to RM
778       */
779      private ResourceRequest setupContainerAskForRM(int numContainers) {
780        ResourceRequest request = Records.newRecord(ResourceRequest.class);
781    
782        // setup requirements for hosts 
783        // whether a particular rack/host is needed 
784        // Refer to apis under org.apache.hadoop.net for more 
785        // details on how to get figure out rack/host mapping.
786        // using * as any host will do for the distributed shell app
787        request.setHostName("*");
788    
789        // set no. of containers needed
790        request.setNumContainers(numContainers);
791    
792        // set the priority for the request
793        Priority pri = Records.newRecord(Priority.class);
794        // TODO - what is the range for priority? how to decide? 
795        pri.setPriority(requestPriority);
796        request.setPriority(pri);
797    
798        // Set up resource type requirements
799        // For now, only memory is supported so we set memory requirements
800        Resource capability = Records.newRecord(Resource.class);
801        capability.setMemory(containerMemory);
802        request.setCapability(capability);
803    
804        return request;
805      }
806    
807      /**
808       * Ask RM to allocate given no. of containers to this Application Master
809       * @param requestedContainers Containers to ask for from RM
810       * @return Response from RM to AM with allocated containers 
811       * @throws YarnRemoteException
812       */
813      private AMResponse sendContainerAskToRM(List<ResourceRequest> requestedContainers)
814          throws YarnRemoteException {
815        AllocateRequest req = Records.newRecord(AllocateRequest.class);
816        req.setResponseId(rmRequestID.incrementAndGet());
817        req.setApplicationAttemptId(appAttemptID);
818        req.addAllAsks(requestedContainers);
819        req.addAllReleases(releasedContainers);
820        req.setProgress((float)numCompletedContainers.get()/numTotalContainers);
821    
822        LOG.info("Sending request to RM for containers"
823            + ", requestedSet=" + requestedContainers.size()
824            + ", releasedSet=" + releasedContainers.size()
825            + ", progress=" + req.getProgress());
826    
827        for (ResourceRequest  rsrcReq : requestedContainers) {
828          LOG.info("Requested container ask: " + rsrcReq.toString());
829        }
830        for (ContainerId id : releasedContainers) {
831          LOG.info("Released container, id=" + id.getId());
832        }
833    
834        AllocateResponse resp = resourceManager.allocate(req);
835        return resp.getAMResponse();
836      }
837    }