001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.yarn.applications.distributedshell;
020    
021    import java.io.BufferedReader;
022    import java.io.IOException;
023    import java.io.InputStreamReader;
024    import java.net.InetSocketAddress;
025    import java.net.URI;
026    import java.net.URISyntaxException;
027    import java.util.ArrayList;
028    import java.util.HashMap;
029    import java.util.List;
030    import java.util.Map;
031    import java.util.Vector;
032    import java.util.concurrent.atomic.AtomicInteger;
033    
034    import org.apache.commons.cli.CommandLine;
035    import org.apache.commons.cli.GnuParser;
036    import org.apache.commons.cli.HelpFormatter;
037    import org.apache.commons.cli.Options;
038    import org.apache.commons.cli.ParseException;
039    import org.apache.commons.logging.Log;
040    import org.apache.commons.logging.LogFactory;
041    
042    import org.apache.hadoop.classification.InterfaceAudience;
043    import org.apache.hadoop.classification.InterfaceStability;
044    import org.apache.hadoop.conf.Configuration;
045    import org.apache.hadoop.net.NetUtils;
046    import org.apache.hadoop.yarn.api.AMRMProtocol;
047    import org.apache.hadoop.yarn.api.ApplicationConstants;
048    import org.apache.hadoop.yarn.api.ContainerManager;
049    
050    import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest;
051    import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse;
052    import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest;
053    import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse;
054    import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
055    
056    import org.apache.hadoop.yarn.api.records.AMResponse;
057    import org.apache.hadoop.yarn.api.records.ApplicationAttemptId;
058    import org.apache.hadoop.yarn.api.records.Container;
059    import org.apache.hadoop.yarn.api.records.ContainerId;
060    import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
061    import org.apache.hadoop.yarn.api.records.ContainerState;
062    import org.apache.hadoop.yarn.api.records.ContainerStatus;
063    import org.apache.hadoop.yarn.api.records.FinalApplicationStatus;
064    import org.apache.hadoop.yarn.api.records.LocalResource;
065    import org.apache.hadoop.yarn.api.records.LocalResourceType;
066    import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
067    import org.apache.hadoop.yarn.api.records.Priority;
068    import org.apache.hadoop.yarn.api.records.Resource;
069    import org.apache.hadoop.yarn.api.records.ResourceRequest;
070    import org.apache.hadoop.yarn.client.AMRMClient;
071    import org.apache.hadoop.yarn.client.AMRMClient.ContainerRequest;
072    import org.apache.hadoop.yarn.client.AMRMClientImpl;
073    import org.apache.hadoop.yarn.conf.YarnConfiguration;
074    import org.apache.hadoop.yarn.exceptions.YarnRemoteException;
075    import org.apache.hadoop.yarn.ipc.YarnRPC;
076    import org.apache.hadoop.yarn.util.ConverterUtils;
077    import org.apache.hadoop.yarn.util.Records;
078    
079    /**
080     * An ApplicationMaster for executing shell commands on a set of launched
081     * containers using the YARN framework.
082     * 
083     * <p>
084     * This class is meant to act as an example on how to write yarn-based
085     * application masters.
086     * </p>
087     * 
088     * <p>
089     * The ApplicationMaster is started on a container by the
090     * <code>ResourceManager</code>'s launcher. The first thing that the
091     * <code>ApplicationMaster</code> needs to do is to connect and register itself
092     * with the <code>ResourceManager</code>. The registration sets up information
093     * within the <code>ResourceManager</code> regarding what host:port the
094     * ApplicationMaster is listening on to provide any form of functionality to a
095     * client as well as a tracking url that a client can use to keep track of
096     * status/job history if needed.
097     * </p>
098     * 
        * <p>
        * The <code>ApplicationMaster</code> needs to send a heartbeat to the
        * <code>ResourceManager</code> at regular intervals to inform the
        * <code>ResourceManager</code> that it is up and alive. The
        * {@link AMRMProtocol#allocate} call from the <code>ApplicationMaster</code>
        * to the <code>ResourceManager</code> acts as a heartbeat.
        * </p>
        *
106     * <p>
107     * For the actual handling of the job, the <code>ApplicationMaster</code> has to
108     * request the <code>ResourceManager</code> via {@link AllocateRequest} for the
109     * required no. of containers using {@link ResourceRequest} with the necessary
110     * resource specifications such as node location, computational
111     * (memory/disk/cpu) resource requirements. The <code>ResourceManager</code>
112     * responds with an {@link AllocateResponse} that informs the
113     * <code>ApplicationMaster</code> of the set of newly allocated containers,
114     * completed containers as well as current state of available resources.
115     * </p>
116     * 
117     * <p>
118     * For each allocated container, the <code>ApplicationMaster</code> can then set
119     * up the necessary launch context via {@link ContainerLaunchContext} to specify
120     * the allocated container id, local resources required by the executable, the
121     * environment to be setup for the executable, commands to execute, etc. and
122     * submit a {@link StartContainerRequest} to the {@link ContainerManager} to
123     * launch and execute the defined commands on the given allocated container.
124     * </p>
125     * 
        * <p>
        * The <code>ApplicationMaster</code> can monitor the launched container by
        * either querying the <code>ResourceManager</code> using
        * {@link AMRMProtocol#allocate} to get updates on completed containers or via
        * the {@link ContainerManager} by querying for the status of the allocated
        * container's {@link ContainerId}.
        * </p>
        *
        * <p>
        * After the job has been completed, the <code>ApplicationMaster</code> has to
        * send a {@link FinishApplicationMasterRequest} to the
        * <code>ResourceManager</code> to inform it that the
        * <code>ApplicationMaster</code> has been completed.
        * </p>
        */
139    @InterfaceAudience.Public
140    @InterfaceStability.Unstable
141    public class ApplicationMaster {
142    
143      private static final Log LOG = LogFactory.getLog(ApplicationMaster.class);
144    
145      // Configuration
146      private Configuration conf;
147      // YARN RPC to communicate with the Resource Manager or Node Manager
148      private YarnRPC rpc;
149    
150      // Handle to communicate with the Resource Manager
151      private AMRMClient resourceManager;
152    
153      // Application Attempt Id ( combination of attemptId and fail count )
154      private ApplicationAttemptId appAttemptID;
155    
156      // TODO
157      // For status update for clients - yet to be implemented
158      // Hostname of the container
159      private String appMasterHostname = "";
160      // Port on which the app master listens for status updates from clients
161      private int appMasterRpcPort = 0;
162      // Tracking url to which app master publishes info for clients to monitor
163      private String appMasterTrackingUrl = "";
164    
165      // App Master configuration
166      // No. of containers to run shell command on
167      private int numTotalContainers = 1;
168      // Memory to request for the container on which the shell command will run
169      private int containerMemory = 10;
170      // Priority of the request
171      private int requestPriority;
172    
173      // Simple flag to denote whether all works is done
174      private boolean appDone = false;
175      // Counter for completed containers ( complete denotes successful or failed )
176      private AtomicInteger numCompletedContainers = new AtomicInteger();
177      // Allocated container count so that we know how many containers has the RM
178      // allocated to us
179      private AtomicInteger numAllocatedContainers = new AtomicInteger();
180      // Count of failed containers
181      private AtomicInteger numFailedContainers = new AtomicInteger();
182      // Count of containers already requested from the RM
183      // Needed as once requested, we should not request for containers again.
184      // Only request for more if the original requirement changes.
185      private AtomicInteger numRequestedContainers = new AtomicInteger();
186    
187      // Shell command to be executed
188      private String shellCommand = "";
189      // Args to be passed to the shell command
190      private String shellArgs = "";
191      // Env variables to be setup for the shell command
192      private Map<String, String> shellEnv = new HashMap<String, String>();
193    
194      // Location of shell script ( obtained from info set in env )
195      // Shell script path in fs
196      private String shellScriptPath = "";
197      // Timestamp needed for creating a local resource
198      private long shellScriptPathTimestamp = 0;
199      // File length needed for local resource
200      private long shellScriptPathLen = 0;
201    
202      // Hardcoded path to shell script in launch container's local env
203      private final String ExecShellStringPath = "ExecShellScript.sh";
204    
205      // Launch threads
206      private List<Thread> launchThreads = new ArrayList<Thread>();
207    
208      /**
209       * @param args Command line args
210       */
211      public static void main(String[] args) {
212        boolean result = false;
213        try {
214          ApplicationMaster appMaster = new ApplicationMaster();
215          LOG.info("Initializing ApplicationMaster");
216          boolean doRun = appMaster.init(args);
217          if (!doRun) {
218            System.exit(0);
219          }
220          result = appMaster.run();
221        } catch (Throwable t) {
222          LOG.fatal("Error running ApplicationMaster", t);
223          System.exit(1);
224        }
225        if (result) {
226          LOG.info("Application Master completed successfully. exiting");
227          System.exit(0);
228        } else {
229          LOG.info("Application Master failed. exiting");
230          System.exit(2);
231        }
232      }
233    
234      /**
235       * Dump out contents of $CWD and the environment to stdout for debugging
236       */
237      private void dumpOutDebugInfo() {
238    
239        LOG.info("Dump debug output");
240        Map<String, String> envs = System.getenv();
241        for (Map.Entry<String, String> env : envs.entrySet()) {
242          LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue());
243          System.out.println("System env: key=" + env.getKey() + ", val="
244              + env.getValue());
245        }
246    
247        String cmd = "ls -al";
248        Runtime run = Runtime.getRuntime();
249        Process pr = null;
250        try {
251          pr = run.exec(cmd);
252          pr.waitFor();
253    
254          BufferedReader buf = new BufferedReader(new InputStreamReader(
255              pr.getInputStream()));
256          String line = "";
257          while ((line = buf.readLine()) != null) {
258            LOG.info("System CWD content: " + line);
259            System.out.println("System CWD content: " + line);
260          }
261          buf.close();
262        } catch (IOException e) {
263          e.printStackTrace();
264        } catch (InterruptedException e) {
265          e.printStackTrace();
266        }
267      }
268    
269      public ApplicationMaster() throws Exception {
270        // Set up the configuration and RPC
271        conf = new YarnConfiguration();
272        rpc = YarnRPC.create(conf);
273      }
274    
275      /**
276       * Parse command line options
277       *
278       * @param args Command line args
279       * @return Whether init successful and run should be invoked
280       * @throws ParseException
281       * @throws IOException
282       */
283      public boolean init(String[] args) throws ParseException, IOException {
284    
285        Options opts = new Options();
286        opts.addOption("app_attempt_id", true,
287            "App Attempt ID. Not to be used unless for testing purposes");
288        opts.addOption("shell_command", true,
289            "Shell command to be executed by the Application Master");
290        opts.addOption("shell_script", true,
291            "Location of the shell script to be executed");
292        opts.addOption("shell_args", true, "Command line args for the shell script");
293        opts.addOption("shell_env", true,
294            "Environment for shell script. Specified as env_key=env_val pairs");
295        opts.addOption("container_memory", true,
296            "Amount of memory in MB to be requested to run the shell command");
297        opts.addOption("num_containers", true,
298            "No. of containers on which the shell command needs to be executed");
299        opts.addOption("priority", true, "Application Priority. Default 0");
300        opts.addOption("debug", false, "Dump out debug information");
301    
302        opts.addOption("help", false, "Print usage");
303        CommandLine cliParser = new GnuParser().parse(opts, args);
304    
305        if (args.length == 0) {
306          printUsage(opts);
307          throw new IllegalArgumentException(
308              "No args specified for application master to initialize");
309        }
310    
311        if (cliParser.hasOption("help")) {
312          printUsage(opts);
313          return false;
314        }
315    
316        if (cliParser.hasOption("debug")) {
317          dumpOutDebugInfo();
318        }
319    
320        Map<String, String> envs = System.getenv();
321    
322        if (envs.containsKey(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV)) {
323          appAttemptID = ConverterUtils.toApplicationAttemptId(envs
324              .get(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV));
325        } else if (!envs.containsKey(ApplicationConstants.AM_CONTAINER_ID_ENV)) {
326          if (cliParser.hasOption("app_attempt_id")) {
327            String appIdStr = cliParser.getOptionValue("app_attempt_id", "");
328            appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr);
329          } else {
330            throw new IllegalArgumentException(
331                "Application Attempt Id not set in the environment");
332          }
333        } else {
334          ContainerId containerId = ConverterUtils.toContainerId(envs
335              .get(ApplicationConstants.AM_CONTAINER_ID_ENV));
336          appAttemptID = containerId.getApplicationAttemptId();
337        }
338    
339        LOG.info("Application master for app" + ", appId="
340            + appAttemptID.getApplicationId().getId() + ", clustertimestamp="
341            + appAttemptID.getApplicationId().getClusterTimestamp()
342            + ", attemptId=" + appAttemptID.getAttemptId());
343    
344        if (!cliParser.hasOption("shell_command")) {
345          throw new IllegalArgumentException(
346              "No shell command specified to be executed by application master");
347        }
348        shellCommand = cliParser.getOptionValue("shell_command");
349    
350        if (cliParser.hasOption("shell_args")) {
351          shellArgs = cliParser.getOptionValue("shell_args");
352        }
353        if (cliParser.hasOption("shell_env")) {
354          String shellEnvs[] = cliParser.getOptionValues("shell_env");
355          for (String env : shellEnvs) {
356            env = env.trim();
357            int index = env.indexOf('=');
358            if (index == -1) {
359              shellEnv.put(env, "");
360              continue;
361            }
362            String key = env.substring(0, index);
363            String val = "";
364            if (index < (env.length() - 1)) {
365              val = env.substring(index + 1);
366            }
367            shellEnv.put(key, val);
368          }
369        }
370    
371        if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION)) {
372          shellScriptPath = envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION);
373    
374          if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)) {
375            shellScriptPathTimestamp = Long.valueOf(envs
376                .get(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP));
377          }
378          if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)) {
379            shellScriptPathLen = Long.valueOf(envs
380                .get(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN));
381          }
382    
383          if (!shellScriptPath.isEmpty()
384              && (shellScriptPathTimestamp <= 0 || shellScriptPathLen <= 0)) {
385            LOG.error("Illegal values in env for shell script path" + ", path="
386                + shellScriptPath + ", len=" + shellScriptPathLen + ", timestamp="
387                + shellScriptPathTimestamp);
388            throw new IllegalArgumentException(
389                "Illegal values in env for shell script path");
390          }
391        }
392    
393        containerMemory = Integer.parseInt(cliParser.getOptionValue(
394            "container_memory", "10"));
395        numTotalContainers = Integer.parseInt(cliParser.getOptionValue(
396            "num_containers", "1"));
397        requestPriority = Integer.parseInt(cliParser
398            .getOptionValue("priority", "0"));
399    
400        return true;
401      }
402    
403      /**
404       * Helper function to print usage
405       *
406       * @param opts Parsed command line options
407       */
408      private void printUsage(Options opts) {
409        new HelpFormatter().printHelp("ApplicationMaster", opts);
410      }
411    
412      /**
413       * Main run function for the application master
414       *
415       * @throws YarnRemoteException
416       */
417      public boolean run() throws YarnRemoteException {
418        LOG.info("Starting ApplicationMaster");
419    
420        // Connect to ResourceManager
421        resourceManager = new AMRMClientImpl(appAttemptID);
422        resourceManager.init(conf);
423        resourceManager.start();
424    
425        try {
426          // Setup local RPC Server to accept status requests directly from clients
427          // TODO need to setup a protocol for client to be able to communicate to
428          // the RPC server
429          // TODO use the rpc port info to register with the RM for the client to
430          // send requests to this app master
431    
432          // Register self with ResourceManager
433          RegisterApplicationMasterResponse response = resourceManager
434              .registerApplicationMaster(appMasterHostname, appMasterRpcPort,
435                  appMasterTrackingUrl);
436          // Dump out information about cluster capability as seen by the
437          // resource manager
438          int minMem = response.getMinimumResourceCapability().getMemory();
439          int maxMem = response.getMaximumResourceCapability().getMemory();
440          LOG.info("Min mem capabililty of resources in this cluster " + minMem);
441          LOG.info("Max mem capabililty of resources in this cluster " + maxMem);
442    
443          // A resource ask has to be atleast the minimum of the capability of the
444          // cluster, the value has to be a multiple of the min value and cannot
445          // exceed the max.
446          // If it is not an exact multiple of min, the RM will allocate to the
447          // nearest multiple of min
448          if (containerMemory < minMem) {
449            LOG.info("Container memory specified below min threshold of cluster."
450                + " Using min value." + ", specified=" + containerMemory + ", min="
451                + minMem);
452            containerMemory = minMem;
453          } else if (containerMemory > maxMem) {
454            LOG.info("Container memory specified above max threshold of cluster."
455                + " Using max value." + ", specified=" + containerMemory + ", max="
456                + maxMem);
457            containerMemory = maxMem;
458          }
459    
460          // Setup heartbeat emitter
461          // TODO poll RM every now and then with an empty request to let RM know
462          // that we are alive
463          // The heartbeat interval after which an AM is timed out by the RM is
464          // defined by a config setting:
465          // RM_AM_EXPIRY_INTERVAL_MS with default defined by
466          // DEFAULT_RM_AM_EXPIRY_INTERVAL_MS
467          // The allocate calls to the RM count as heartbeats so, for now,
468          // this additional heartbeat emitter is not required.
469    
470          // Setup ask for containers from RM
471          // Send request for containers to RM
472          // Until we get our fully allocated quota, we keep on polling RM for
473          // containers
474          // Keep looping until all the containers are launched and shell script
475          // executed on them ( regardless of success/failure).
476    
477          int loopCounter = -1;
478    
479          while (numCompletedContainers.get() < numTotalContainers && !appDone) {
480            loopCounter++;
481    
482            // log current state
483            LOG.info("Current application state: loop=" + loopCounter
484                + ", appDone=" + appDone + ", total=" + numTotalContainers
485                + ", requested=" + numRequestedContainers + ", completed="
486                + numCompletedContainers + ", failed=" + numFailedContainers
487                + ", currentAllocated=" + numAllocatedContainers);
488    
489            // Sleep before each loop when asking RM for containers
490            // to avoid flooding RM with spurious requests when it
491            // need not have any available containers
492            // Sleeping for 1000 ms.
493            try {
494              Thread.sleep(1000);
495            } catch (InterruptedException e) {
496              LOG.info("Sleep interrupted " + e.getMessage());
497            }
498    
499            // No. of containers to request
500            // For the first loop, askCount will be equal to total containers needed
501            // From that point on, askCount will always be 0 as current
502            // implementation does not change its ask on container failures.
503            int askCount = numTotalContainers - numRequestedContainers.get();
504            numRequestedContainers.addAndGet(askCount);
505    
506            if (askCount > 0) {
507              ContainerRequest containerAsk = setupContainerAskForRM(askCount);
508              resourceManager.addContainerRequest(containerAsk);
509            }
510    
511            // Send the request to RM
512            LOG.info("Asking RM for containers" + ", askCount=" + askCount);
513            AMResponse amResp = sendContainerAskToRM();
514    
515            // Retrieve list of allocated containers from the response
516            List<Container> allocatedContainers = amResp.getAllocatedContainers();
517            LOG.info("Got response from RM for container ask, allocatedCnt="
518                + allocatedContainers.size());
519            numAllocatedContainers.addAndGet(allocatedContainers.size());
520            for (Container allocatedContainer : allocatedContainers) {
521              LOG.info("Launching shell command on a new container."
522                  + ", containerId=" + allocatedContainer.getId()
523                  + ", containerNode=" + allocatedContainer.getNodeId().getHost()
524                  + ":" + allocatedContainer.getNodeId().getPort()
525                  + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress()
526                  + ", containerState" + allocatedContainer.getState()
527                  + ", containerResourceMemory"
528                  + allocatedContainer.getResource().getMemory());
529              // + ", containerToken"
530              // +allocatedContainer.getContainerToken().getIdentifier().toString());
531    
532              LaunchContainerRunnable runnableLaunchContainer = new LaunchContainerRunnable(
533                  allocatedContainer);
534              Thread launchThread = new Thread(runnableLaunchContainer);
535    
536              // launch and start the container on a separate thread to keep
537              // the main thread unblocked
538              // as all containers may not be allocated at one go.
539              launchThreads.add(launchThread);
540              launchThread.start();
541            }
542    
543            // Check what the current available resources in the cluster are
544            // TODO should we do anything if the available resources are not enough?
545            Resource availableResources = amResp.getAvailableResources();
546            LOG.info("Current available resources in the cluster "
547                + availableResources);
548    
549            // Check the completed containers
550            List<ContainerStatus> completedContainers = amResp
551                .getCompletedContainersStatuses();
552            LOG.info("Got response from RM for container ask, completedCnt="
553                + completedContainers.size());
554            for (ContainerStatus containerStatus : completedContainers) {
555              LOG.info("Got container status for containerID="
556                  + containerStatus.getContainerId() + ", state="
557                  + containerStatus.getState() + ", exitStatus="
558                  + containerStatus.getExitStatus() + ", diagnostics="
559                  + containerStatus.getDiagnostics());
560    
561              // non complete containers should not be here
562              assert (containerStatus.getState() == ContainerState.COMPLETE);
563    
564              // increment counters for completed/failed containers
565              int exitStatus = containerStatus.getExitStatus();
566              if (0 != exitStatus) {
567                // container failed
568                if (-100 != exitStatus) {
569                  // shell script failed
570                  // counts as completed
571                  numCompletedContainers.incrementAndGet();
572                  numFailedContainers.incrementAndGet();
573                } else {
574                  // something else bad happened
575                  // app job did not complete for some reason
576                  // we should re-try as the container was lost for some reason
577                  numAllocatedContainers.decrementAndGet();
578                  numRequestedContainers.decrementAndGet();
579                  // we do not need to release the container as it would be done
580                  // by the RM/CM.
581                }
582              } else {
583                // nothing to do
584                // container completed successfully
585                numCompletedContainers.incrementAndGet();
586                LOG.info("Container completed successfully." + ", containerId="
587                    + containerStatus.getContainerId());
588              }
589            }
590            if (numCompletedContainers.get() == numTotalContainers) {
591              appDone = true;
592            }
593    
594            LOG.info("Current application state: loop=" + loopCounter
595                + ", appDone=" + appDone + ", total=" + numTotalContainers
596                + ", requested=" + numRequestedContainers + ", completed="
597                + numCompletedContainers + ", failed=" + numFailedContainers
598                + ", currentAllocated=" + numAllocatedContainers);
599    
600            // TODO
601            // Add a timeout handling layer
602            // for misbehaving shell commands
603          }
604    
605          // Join all launched threads
606          // needed for when we time out
607          // and we need to release containers
608          for (Thread launchThread : launchThreads) {
609            try {
610              launchThread.join(10000);
611            } catch (InterruptedException e) {
612              LOG.info("Exception thrown in thread join: " + e.getMessage());
613              e.printStackTrace();
614            }
615          }
616    
617          // When the application completes, it should send a finish application
618          // signal to the RM
619          LOG.info("Application completed. Signalling finish to RM");
620    
621          FinalApplicationStatus appStatus;
622          String appMessage = null;
623          boolean isSuccess = true;
624          if (numFailedContainers.get() == 0) {
625            appStatus = FinalApplicationStatus.SUCCEEDED;
626          } else {
627            appStatus = FinalApplicationStatus.FAILED;
628            appMessage = "Diagnostics." + ", total=" + numTotalContainers
629                + ", completed=" + numCompletedContainers.get() + ", allocated="
630                + numAllocatedContainers.get() + ", failed="
631                + numFailedContainers.get();
632            isSuccess = false;
633          }
634          resourceManager.unregisterApplicationMaster(appStatus, appMessage, null);
635          return isSuccess;
636        } finally {
637          resourceManager.stop();
638        }
639      }
640    
641      /**
642       * Thread to connect to the {@link ContainerManager} and launch the container
643       * that will execute the shell command.
644       */
645      private class LaunchContainerRunnable implements Runnable {
646    
647        // Allocated container
648        Container container;
649        // Handle to communicate with ContainerManager
650        ContainerManager cm;
651    
652        /**
653         * @param lcontainer Allocated container
654         */
655        public LaunchContainerRunnable(Container lcontainer) {
656          this.container = lcontainer;
657        }
658    
659        /**
660         * Helper function to connect to CM
661         */
662        private void connectToCM() {
663          LOG.debug("Connecting to ContainerManager for containerid="
664              + container.getId());
665          String cmIpPortStr = container.getNodeId().getHost() + ":"
666              + container.getNodeId().getPort();
667          InetSocketAddress cmAddress = NetUtils.createSocketAddr(cmIpPortStr);
668          LOG.info("Connecting to ContainerManager at " + cmIpPortStr);
669          this.cm = ((ContainerManager) rpc.getProxy(ContainerManager.class,
670              cmAddress, conf));
671        }
672    
673        @Override
674        /**
675         * Connects to CM, sets up container launch context 
676         * for shell command and eventually dispatches the container 
677         * start request to the CM. 
678         */
679        public void run() {
680          // Connect to ContainerManager
681          connectToCM();
682    
683          LOG.info("Setting up container launch container for containerid="
684              + container.getId());
685          ContainerLaunchContext ctx = Records
686              .newRecord(ContainerLaunchContext.class);
687    
688          ctx.setContainerId(container.getId());
689          ctx.setResource(container.getResource());
690    
691          String jobUserName = System.getenv(ApplicationConstants.Environment.USER
692              .name());
693          ctx.setUser(jobUserName);
694          LOG.info("Setting user in ContainerLaunchContext to: " + jobUserName);
695    
696          // Set the environment
697          ctx.setEnvironment(shellEnv);
698    
699          // Set the local resources
700          Map<String, LocalResource> localResources = new HashMap<String, LocalResource>();
701    
702          // The container for the eventual shell commands needs its own local
703          // resources too.
704          // In this scenario, if a shell script is specified, we need to have it
705          // copied and made available to the container.
706          if (!shellScriptPath.isEmpty()) {
707            LocalResource shellRsrc = Records.newRecord(LocalResource.class);
708            shellRsrc.setType(LocalResourceType.FILE);
709            shellRsrc.setVisibility(LocalResourceVisibility.APPLICATION);
710            try {
711              shellRsrc.setResource(ConverterUtils.getYarnUrlFromURI(new URI(
712                  shellScriptPath)));
713            } catch (URISyntaxException e) {
714              LOG.error("Error when trying to use shell script path specified"
715                  + " in env, path=" + shellScriptPath);
716              e.printStackTrace();
717    
718              // A failure scenario on bad input such as invalid shell script path
719              // We know we cannot continue launching the container
720              // so we should release it.
721              // TODO
722              numCompletedContainers.incrementAndGet();
723              numFailedContainers.incrementAndGet();
724              return;
725            }
726            shellRsrc.setTimestamp(shellScriptPathTimestamp);
727            shellRsrc.setSize(shellScriptPathLen);
728            localResources.put(ExecShellStringPath, shellRsrc);
729          }
730          ctx.setLocalResources(localResources);
731    
732          // Set the necessary command to execute on the allocated container
733          Vector<CharSequence> vargs = new Vector<CharSequence>(5);
734    
735          // Set executable command
736          vargs.add(shellCommand);
737          // Set shell script path
738          if (!shellScriptPath.isEmpty()) {
739            vargs.add(ExecShellStringPath);
740          }
741    
742          // Set args for the shell command if any
743          vargs.add(shellArgs);
744          // Add log redirect params
745          vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout");
746          vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr");
747    
748          // Get final commmand
749          StringBuilder command = new StringBuilder();
750          for (CharSequence str : vargs) {
751            command.append(str).append(" ");
752          }
753    
754          List<String> commands = new ArrayList<String>();
755          commands.add(command.toString());
756          ctx.setCommands(commands);
757    
758          StartContainerRequest startReq = Records
759              .newRecord(StartContainerRequest.class);
760          startReq.setContainerLaunchContext(ctx);
761          try {
762            cm.startContainer(startReq);
763          } catch (YarnRemoteException e) {
764            LOG.info("Start container failed for :" + ", containerId="
765                + container.getId());
766            e.printStackTrace();
767            // TODO do we need to release this container?
768          }
769    
770          // Get container status?
771          // Left commented out as the shell scripts are short lived
772          // and we are relying on the status for completed containers
773          // from RM to detect status
774    
775          // GetContainerStatusRequest statusReq =
776          // Records.newRecord(GetContainerStatusRequest.class);
777          // statusReq.setContainerId(container.getId());
778          // GetContainerStatusResponse statusResp;
779          // try {
780          // statusResp = cm.getContainerStatus(statusReq);
781          // LOG.info("Container Status"
782          // + ", id=" + container.getId()
783          // + ", status=" +statusResp.getStatus());
784          // } catch (YarnRemoteException e) {
785          // e.printStackTrace();
786          // }
787        }
788      }
789    
790      /**
791       * Setup the request that will be sent to the RM for the container ask.
792       *
793       * @param numContainers Containers to ask for from RM
794       * @return the setup ResourceRequest to be sent to RM
795       */
796      private ContainerRequest setupContainerAskForRM(int numContainers) {
797        // setup requirements for hosts
798        // using * as any host will do for the distributed shell app
799        // set the priority for the request
800        Priority pri = Records.newRecord(Priority.class);
801        // TODO - what is the range for priority? how to decide?
802        pri.setPriority(requestPriority);
803    
804        // Set up resource type requirements
805        // For now, only memory is supported so we set memory requirements
806        Resource capability = Records.newRecord(Resource.class);
807        capability.setMemory(containerMemory);
808    
809        ContainerRequest request = new ContainerRequest(capability, null, null,
810            pri, numContainers);
811        LOG.info("Requested container ask: " + request.toString());
812        return request;
813      }
814    
815      /**
816       * Ask RM to allocate given no. of containers to this Application Master
817       *
818       * @param requestedContainers Containers to ask for from RM
819       * @return Response from RM to AM with allocated containers
820       * @throws YarnRemoteException
821       */
822      private AMResponse sendContainerAskToRM() throws YarnRemoteException {
823        float progressIndicator = (float) numCompletedContainers.get()
824            / numTotalContainers;
825    
826        LOG.info("Sending request to RM for containers" + ", progress="
827            + progressIndicator);
828    
829        AllocateResponse resp = resourceManager.allocate(progressIndicator);
830        return resp.getAMResponse();
831      }
832    }