001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.yarn.applications.distributedshell; 020 021 import java.io.BufferedReader; 022 import java.io.IOException; 023 import java.io.InputStreamReader; 024 import java.net.InetSocketAddress; 025 import java.net.URI; 026 import java.net.URISyntaxException; 027 import java.util.ArrayList; 028 import java.util.HashMap; 029 import java.util.List; 030 import java.util.Map; 031 import java.util.Vector; 032 import java.util.concurrent.atomic.AtomicInteger; 033 034 import org.apache.commons.cli.CommandLine; 035 import org.apache.commons.cli.GnuParser; 036 import org.apache.commons.cli.HelpFormatter; 037 import org.apache.commons.cli.Options; 038 import org.apache.commons.cli.ParseException; 039 import org.apache.commons.logging.Log; 040 import org.apache.commons.logging.LogFactory; 041 042 import org.apache.hadoop.classification.InterfaceAudience; 043 import org.apache.hadoop.classification.InterfaceStability; 044 import org.apache.hadoop.conf.Configuration; 045 import org.apache.hadoop.net.NetUtils; 046 import org.apache.hadoop.yarn.api.AMRMProtocol; 047 import org.apache.hadoop.yarn.api.ApplicationConstants; 048 import org.apache.hadoop.yarn.api.ContainerManager; 049 050 import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; 051 import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; 052 import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest; 053 import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse; 054 import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; 055 056 import org.apache.hadoop.yarn.api.records.AMResponse; 057 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; 058 import org.apache.hadoop.yarn.api.records.Container; 059 import org.apache.hadoop.yarn.api.records.ContainerId; 060 import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; 061 import org.apache.hadoop.yarn.api.records.ContainerState; 062 import org.apache.hadoop.yarn.api.records.ContainerStatus; 063 import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; 064 import org.apache.hadoop.yarn.api.records.LocalResource; 065 import org.apache.hadoop.yarn.api.records.LocalResourceType; 066 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; 067 import org.apache.hadoop.yarn.api.records.Priority; 068 import org.apache.hadoop.yarn.api.records.Resource; 069 import org.apache.hadoop.yarn.api.records.ResourceRequest; 070 import org.apache.hadoop.yarn.client.AMRMClient; 071 import org.apache.hadoop.yarn.client.AMRMClient.ContainerRequest; 072 import org.apache.hadoop.yarn.client.AMRMClientImpl; 073 import org.apache.hadoop.yarn.conf.YarnConfiguration; 074 import org.apache.hadoop.yarn.exceptions.YarnRemoteException; 075 import org.apache.hadoop.yarn.ipc.YarnRPC; 076 import org.apache.hadoop.yarn.util.ConverterUtils; 077 import org.apache.hadoop.yarn.util.Records; 078 079 /** 080 * An ApplicationMaster for executing shell commands on a set of launched 081 * containers using the YARN framework. 082 * 083 * <p> 084 * This class is meant to act as an example on how to write yarn-based 085 * application masters. 086 * </p> 087 * 088 * <p> 089 * The ApplicationMaster is started on a container by the 090 * <code>ResourceManager</code>'s launcher. The first thing that the 091 * <code>ApplicationMaster</code> needs to do is to connect and register itself 092 * with the <code>ResourceManager</code>. The registration sets up information 093 * within the <code>ResourceManager</code> regarding what host:port the 094 * ApplicationMaster is listening on to provide any form of functionality to a 095 * client as well as a tracking url that a client can use to keep track of 096 * status/job history if needed. 097 * </p> 098 * 099 * <p> 100 * The <code>ApplicationMaster</code> needs to send a heartbeat to the 101 * <code>ResourceManager</code> at regular intervals to inform the 102 * <code>ResourceManager</code> that it is up and alive. The 103 * {@link AMRMProtocol#allocate} to the <code>ResourceManager</code> from the 104 * <code>ApplicationMaster</code> acts as a heartbeat. 105 * 106 * <p> 107 * For the actual handling of the job, the <code>ApplicationMaster</code> has to 108 * request the <code>ResourceManager</code> via {@link AllocateRequest} for the 109 * required no. of containers using {@link ResourceRequest} with the necessary 110 * resource specifications such as node location, computational 111 * (memory/disk/cpu) resource requirements. The <code>ResourceManager</code> 112 * responds with an {@link AllocateResponse} that informs the 113 * <code>ApplicationMaster</code> of the set of newly allocated containers, 114 * completed containers as well as current state of available resources. 115 * </p> 116 * 117 * <p> 118 * For each allocated container, the <code>ApplicationMaster</code> can then set 119 * up the necessary launch context via {@link ContainerLaunchContext} to specify 120 * the allocated container id, local resources required by the executable, the 121 * environment to be setup for the executable, commands to execute, etc. and 122 * submit a {@link StartContainerRequest} to the {@link ContainerManager} to 123 * launch and execute the defined commands on the given allocated container. 124 * </p> 125 * 126 * <p> 127 * The <code>ApplicationMaster</code> can monitor the launched container by 128 * either querying the <code>ResourceManager</code> using 129 * {@link AMRMProtocol#allocate} to get updates on completed containers or via 130 * the {@link ContainerManager} by querying for the status of the allocated 131 * container's {@link ContainerId}. 132 * 133 * <p> 134 * After the job has been completed, the <code>ApplicationMaster</code> has to 135 * send a {@link FinishApplicationMasterRequest} to the 136 * <code>ResourceManager</code> to inform it that the 137 * <code>ApplicationMaster</code> has been completed. 138 */ 139 @InterfaceAudience.Public 140 @InterfaceStability.Unstable 141 public class ApplicationMaster { 142 143 private static final Log LOG = LogFactory.getLog(ApplicationMaster.class); 144 145 // Configuration 146 private Configuration conf; 147 // YARN RPC to communicate with the Resource Manager or Node Manager 148 private YarnRPC rpc; 149 150 // Handle to communicate with the Resource Manager 151 private AMRMClient resourceManager; 152 153 // Application Attempt Id ( combination of attemptId and fail count ) 154 private ApplicationAttemptId appAttemptID; 155 156 // TODO 157 // For status update for clients - yet to be implemented 158 // Hostname of the container 159 private String appMasterHostname = ""; 160 // Port on which the app master listens for status updates from clients 161 private int appMasterRpcPort = 0; 162 // Tracking url to which app master publishes info for clients to monitor 163 private String appMasterTrackingUrl = ""; 164 165 // App Master configuration 166 // No. of containers to run shell command on 167 private int numTotalContainers = 1; 168 // Memory to request for the container on which the shell command will run 169 private int containerMemory = 10; 170 // Priority of the request 171 private int requestPriority; 172 173 // Simple flag to denote whether all works is done 174 private boolean appDone = false; 175 // Counter for completed containers ( complete denotes successful or failed ) 176 private AtomicInteger numCompletedContainers = new AtomicInteger(); 177 // Allocated container count so that we know how many containers has the RM 178 // allocated to us 179 private AtomicInteger numAllocatedContainers = new AtomicInteger(); 180 // Count of failed containers 181 private AtomicInteger numFailedContainers = new AtomicInteger(); 182 // Count of containers already requested from the RM 183 // Needed as once requested, we should not request for containers again. 184 // Only request for more if the original requirement changes. 185 private AtomicInteger numRequestedContainers = new AtomicInteger(); 186 187 // Shell command to be executed 188 private String shellCommand = ""; 189 // Args to be passed to the shell command 190 private String shellArgs = ""; 191 // Env variables to be setup for the shell command 192 private Map<String, String> shellEnv = new HashMap<String, String>(); 193 194 // Location of shell script ( obtained from info set in env ) 195 // Shell script path in fs 196 private String shellScriptPath = ""; 197 // Timestamp needed for creating a local resource 198 private long shellScriptPathTimestamp = 0; 199 // File length needed for local resource 200 private long shellScriptPathLen = 0; 201 202 // Hardcoded path to shell script in launch container's local env 203 private final String ExecShellStringPath = "ExecShellScript.sh"; 204 205 // Launch threads 206 private List<Thread> launchThreads = new ArrayList<Thread>(); 207 208 /** 209 * @param args Command line args 210 */ 211 public static void main(String[] args) { 212 boolean result = false; 213 try { 214 ApplicationMaster appMaster = new ApplicationMaster(); 215 LOG.info("Initializing ApplicationMaster"); 216 boolean doRun = appMaster.init(args); 217 if (!doRun) { 218 System.exit(0); 219 } 220 result = appMaster.run(); 221 } catch (Throwable t) { 222 LOG.fatal("Error running ApplicationMaster", t); 223 System.exit(1); 224 } 225 if (result) { 226 LOG.info("Application Master completed successfully. exiting"); 227 System.exit(0); 228 } else { 229 LOG.info("Application Master failed. exiting"); 230 System.exit(2); 231 } 232 } 233 234 /** 235 * Dump out contents of $CWD and the environment to stdout for debugging 236 */ 237 private void dumpOutDebugInfo() { 238 239 LOG.info("Dump debug output"); 240 Map<String, String> envs = System.getenv(); 241 for (Map.Entry<String, String> env : envs.entrySet()) { 242 LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue()); 243 System.out.println("System env: key=" + env.getKey() + ", val=" 244 + env.getValue()); 245 } 246 247 String cmd = "ls -al"; 248 Runtime run = Runtime.getRuntime(); 249 Process pr = null; 250 try { 251 pr = run.exec(cmd); 252 pr.waitFor(); 253 254 BufferedReader buf = new BufferedReader(new InputStreamReader( 255 pr.getInputStream())); 256 String line = ""; 257 while ((line = buf.readLine()) != null) { 258 LOG.info("System CWD content: " + line); 259 System.out.println("System CWD content: " + line); 260 } 261 buf.close(); 262 } catch (IOException e) { 263 e.printStackTrace(); 264 } catch (InterruptedException e) { 265 e.printStackTrace(); 266 } 267 } 268 269 public ApplicationMaster() throws Exception { 270 // Set up the configuration and RPC 271 conf = new YarnConfiguration(); 272 rpc = YarnRPC.create(conf); 273 } 274 275 /** 276 * Parse command line options 277 * 278 * @param args Command line args 279 * @return Whether init successful and run should be invoked 280 * @throws ParseException 281 * @throws IOException 282 */ 283 public boolean init(String[] args) throws ParseException, IOException { 284 285 Options opts = new Options(); 286 opts.addOption("app_attempt_id", true, 287 "App Attempt ID. Not to be used unless for testing purposes"); 288 opts.addOption("shell_command", true, 289 "Shell command to be executed by the Application Master"); 290 opts.addOption("shell_script", true, 291 "Location of the shell script to be executed"); 292 opts.addOption("shell_args", true, "Command line args for the shell script"); 293 opts.addOption("shell_env", true, 294 "Environment for shell script. Specified as env_key=env_val pairs"); 295 opts.addOption("container_memory", true, 296 "Amount of memory in MB to be requested to run the shell command"); 297 opts.addOption("num_containers", true, 298 "No. of containers on which the shell command needs to be executed"); 299 opts.addOption("priority", true, "Application Priority. Default 0"); 300 opts.addOption("debug", false, "Dump out debug information"); 301 302 opts.addOption("help", false, "Print usage"); 303 CommandLine cliParser = new GnuParser().parse(opts, args); 304 305 if (args.length == 0) { 306 printUsage(opts); 307 throw new IllegalArgumentException( 308 "No args specified for application master to initialize"); 309 } 310 311 if (cliParser.hasOption("help")) { 312 printUsage(opts); 313 return false; 314 } 315 316 if (cliParser.hasOption("debug")) { 317 dumpOutDebugInfo(); 318 } 319 320 Map<String, String> envs = System.getenv(); 321 322 if (envs.containsKey(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV)) { 323 appAttemptID = ConverterUtils.toApplicationAttemptId(envs 324 .get(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV)); 325 } else if (!envs.containsKey(ApplicationConstants.AM_CONTAINER_ID_ENV)) { 326 if (cliParser.hasOption("app_attempt_id")) { 327 String appIdStr = cliParser.getOptionValue("app_attempt_id", ""); 328 appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr); 329 } else { 330 throw new IllegalArgumentException( 331 "Application Attempt Id not set in the environment"); 332 } 333 } else { 334 ContainerId containerId = ConverterUtils.toContainerId(envs 335 .get(ApplicationConstants.AM_CONTAINER_ID_ENV)); 336 appAttemptID = containerId.getApplicationAttemptId(); 337 } 338 339 LOG.info("Application master for app" + ", appId=" 340 + appAttemptID.getApplicationId().getId() + ", clustertimestamp=" 341 + appAttemptID.getApplicationId().getClusterTimestamp() 342 + ", attemptId=" + appAttemptID.getAttemptId()); 343 344 if (!cliParser.hasOption("shell_command")) { 345 throw new IllegalArgumentException( 346 "No shell command specified to be executed by application master"); 347 } 348 shellCommand = cliParser.getOptionValue("shell_command"); 349 350 if (cliParser.hasOption("shell_args")) { 351 shellArgs = cliParser.getOptionValue("shell_args"); 352 } 353 if (cliParser.hasOption("shell_env")) { 354 String shellEnvs[] = cliParser.getOptionValues("shell_env"); 355 for (String env : shellEnvs) { 356 env = env.trim(); 357 int index = env.indexOf('='); 358 if (index == -1) { 359 shellEnv.put(env, ""); 360 continue; 361 } 362 String key = env.substring(0, index); 363 String val = ""; 364 if (index < (env.length() - 1)) { 365 val = env.substring(index + 1); 366 } 367 shellEnv.put(key, val); 368 } 369 } 370 371 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION)) { 372 shellScriptPath = envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION); 373 374 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)) { 375 shellScriptPathTimestamp = Long.valueOf(envs 376 .get(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)); 377 } 378 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)) { 379 shellScriptPathLen = Long.valueOf(envs 380 .get(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)); 381 } 382 383 if (!shellScriptPath.isEmpty() 384 && (shellScriptPathTimestamp <= 0 || shellScriptPathLen <= 0)) { 385 LOG.error("Illegal values in env for shell script path" + ", path=" 386 + shellScriptPath + ", len=" + shellScriptPathLen + ", timestamp=" 387 + shellScriptPathTimestamp); 388 throw new IllegalArgumentException( 389 "Illegal values in env for shell script path"); 390 } 391 } 392 393 containerMemory = Integer.parseInt(cliParser.getOptionValue( 394 "container_memory", "10")); 395 numTotalContainers = Integer.parseInt(cliParser.getOptionValue( 396 "num_containers", "1")); 397 requestPriority = Integer.parseInt(cliParser 398 .getOptionValue("priority", "0")); 399 400 return true; 401 } 402 403 /** 404 * Helper function to print usage 405 * 406 * @param opts Parsed command line options 407 */ 408 private void printUsage(Options opts) { 409 new HelpFormatter().printHelp("ApplicationMaster", opts); 410 } 411 412 /** 413 * Main run function for the application master 414 * 415 * @throws YarnRemoteException 416 */ 417 public boolean run() throws YarnRemoteException { 418 LOG.info("Starting ApplicationMaster"); 419 420 // Connect to ResourceManager 421 resourceManager = new AMRMClientImpl(appAttemptID); 422 resourceManager.init(conf); 423 resourceManager.start(); 424 425 try { 426 // Setup local RPC Server to accept status requests directly from clients 427 // TODO need to setup a protocol for client to be able to communicate to 428 // the RPC server 429 // TODO use the rpc port info to register with the RM for the client to 430 // send requests to this app master 431 432 // Register self with ResourceManager 433 RegisterApplicationMasterResponse response = resourceManager 434 .registerApplicationMaster(appMasterHostname, appMasterRpcPort, 435 appMasterTrackingUrl); 436 // Dump out information about cluster capability as seen by the 437 // resource manager 438 int minMem = response.getMinimumResourceCapability().getMemory(); 439 int maxMem = response.getMaximumResourceCapability().getMemory(); 440 LOG.info("Min mem capabililty of resources in this cluster " + minMem); 441 LOG.info("Max mem capabililty of resources in this cluster " + maxMem); 442 443 // A resource ask has to be atleast the minimum of the capability of the 444 // cluster, the value has to be a multiple of the min value and cannot 445 // exceed the max. 446 // If it is not an exact multiple of min, the RM will allocate to the 447 // nearest multiple of min 448 if (containerMemory < minMem) { 449 LOG.info("Container memory specified below min threshold of cluster." 450 + " Using min value." + ", specified=" + containerMemory + ", min=" 451 + minMem); 452 containerMemory = minMem; 453 } else if (containerMemory > maxMem) { 454 LOG.info("Container memory specified above max threshold of cluster." 455 + " Using max value." + ", specified=" + containerMemory + ", max=" 456 + maxMem); 457 containerMemory = maxMem; 458 } 459 460 // Setup heartbeat emitter 461 // TODO poll RM every now and then with an empty request to let RM know 462 // that we are alive 463 // The heartbeat interval after which an AM is timed out by the RM is 464 // defined by a config setting: 465 // RM_AM_EXPIRY_INTERVAL_MS with default defined by 466 // DEFAULT_RM_AM_EXPIRY_INTERVAL_MS 467 // The allocate calls to the RM count as heartbeats so, for now, 468 // this additional heartbeat emitter is not required. 469 470 // Setup ask for containers from RM 471 // Send request for containers to RM 472 // Until we get our fully allocated quota, we keep on polling RM for 473 // containers 474 // Keep looping until all the containers are launched and shell script 475 // executed on them ( regardless of success/failure). 476 477 int loopCounter = -1; 478 479 while (numCompletedContainers.get() < numTotalContainers && !appDone) { 480 loopCounter++; 481 482 // log current state 483 LOG.info("Current application state: loop=" + loopCounter 484 + ", appDone=" + appDone + ", total=" + numTotalContainers 485 + ", requested=" + numRequestedContainers + ", completed=" 486 + numCompletedContainers + ", failed=" + numFailedContainers 487 + ", currentAllocated=" + numAllocatedContainers); 488 489 // Sleep before each loop when asking RM for containers 490 // to avoid flooding RM with spurious requests when it 491 // need not have any available containers 492 // Sleeping for 1000 ms. 493 try { 494 Thread.sleep(1000); 495 } catch (InterruptedException e) { 496 LOG.info("Sleep interrupted " + e.getMessage()); 497 } 498 499 // No. of containers to request 500 // For the first loop, askCount will be equal to total containers needed 501 // From that point on, askCount will always be 0 as current 502 // implementation does not change its ask on container failures. 503 int askCount = numTotalContainers - numRequestedContainers.get(); 504 numRequestedContainers.addAndGet(askCount); 505 506 if (askCount > 0) { 507 ContainerRequest containerAsk = setupContainerAskForRM(askCount); 508 resourceManager.addContainerRequest(containerAsk); 509 } 510 511 // Send the request to RM 512 LOG.info("Asking RM for containers" + ", askCount=" + askCount); 513 AMResponse amResp = sendContainerAskToRM(); 514 515 // Retrieve list of allocated containers from the response 516 List<Container> allocatedContainers = amResp.getAllocatedContainers(); 517 LOG.info("Got response from RM for container ask, allocatedCnt=" 518 + allocatedContainers.size()); 519 numAllocatedContainers.addAndGet(allocatedContainers.size()); 520 for (Container allocatedContainer : allocatedContainers) { 521 LOG.info("Launching shell command on a new container." 522 + ", containerId=" + allocatedContainer.getId() 523 + ", containerNode=" + allocatedContainer.getNodeId().getHost() 524 + ":" + allocatedContainer.getNodeId().getPort() 525 + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() 526 + ", containerState" + allocatedContainer.getState() 527 + ", containerResourceMemory" 528 + allocatedContainer.getResource().getMemory()); 529 // + ", containerToken" 530 // +allocatedContainer.getContainerToken().getIdentifier().toString()); 531 532 LaunchContainerRunnable runnableLaunchContainer = new LaunchContainerRunnable( 533 allocatedContainer); 534 Thread launchThread = new Thread(runnableLaunchContainer); 535 536 // launch and start the container on a separate thread to keep 537 // the main thread unblocked 538 // as all containers may not be allocated at one go. 539 launchThreads.add(launchThread); 540 launchThread.start(); 541 } 542 543 // Check what the current available resources in the cluster are 544 // TODO should we do anything if the available resources are not enough? 545 Resource availableResources = amResp.getAvailableResources(); 546 LOG.info("Current available resources in the cluster " 547 + availableResources); 548 549 // Check the completed containers 550 List<ContainerStatus> completedContainers = amResp 551 .getCompletedContainersStatuses(); 552 LOG.info("Got response from RM for container ask, completedCnt=" 553 + completedContainers.size()); 554 for (ContainerStatus containerStatus : completedContainers) { 555 LOG.info("Got container status for containerID=" 556 + containerStatus.getContainerId() + ", state=" 557 + containerStatus.getState() + ", exitStatus=" 558 + containerStatus.getExitStatus() + ", diagnostics=" 559 + containerStatus.getDiagnostics()); 560 561 // non complete containers should not be here 562 assert (containerStatus.getState() == ContainerState.COMPLETE); 563 564 // increment counters for completed/failed containers 565 int exitStatus = containerStatus.getExitStatus(); 566 if (0 != exitStatus) { 567 // container failed 568 if (-100 != exitStatus) { 569 // shell script failed 570 // counts as completed 571 numCompletedContainers.incrementAndGet(); 572 numFailedContainers.incrementAndGet(); 573 } else { 574 // something else bad happened 575 // app job did not complete for some reason 576 // we should re-try as the container was lost for some reason 577 numAllocatedContainers.decrementAndGet(); 578 numRequestedContainers.decrementAndGet(); 579 // we do not need to release the container as it would be done 580 // by the RM/CM. 581 } 582 } else { 583 // nothing to do 584 // container completed successfully 585 numCompletedContainers.incrementAndGet(); 586 LOG.info("Container completed successfully." + ", containerId=" 587 + containerStatus.getContainerId()); 588 } 589 } 590 if (numCompletedContainers.get() == numTotalContainers) { 591 appDone = true; 592 } 593 594 LOG.info("Current application state: loop=" + loopCounter 595 + ", appDone=" + appDone + ", total=" + numTotalContainers 596 + ", requested=" + numRequestedContainers + ", completed=" 597 + numCompletedContainers + ", failed=" + numFailedContainers 598 + ", currentAllocated=" + numAllocatedContainers); 599 600 // TODO 601 // Add a timeout handling layer 602 // for misbehaving shell commands 603 } 604 605 // Join all launched threads 606 // needed for when we time out 607 // and we need to release containers 608 for (Thread launchThread : launchThreads) { 609 try { 610 launchThread.join(10000); 611 } catch (InterruptedException e) { 612 LOG.info("Exception thrown in thread join: " + e.getMessage()); 613 e.printStackTrace(); 614 } 615 } 616 617 // When the application completes, it should send a finish application 618 // signal to the RM 619 LOG.info("Application completed. Signalling finish to RM"); 620 621 FinalApplicationStatus appStatus; 622 String appMessage = null; 623 boolean isSuccess = true; 624 if (numFailedContainers.get() == 0) { 625 appStatus = FinalApplicationStatus.SUCCEEDED; 626 } else { 627 appStatus = FinalApplicationStatus.FAILED; 628 appMessage = "Diagnostics." + ", total=" + numTotalContainers 629 + ", completed=" + numCompletedContainers.get() + ", allocated=" 630 + numAllocatedContainers.get() + ", failed=" 631 + numFailedContainers.get(); 632 isSuccess = false; 633 } 634 resourceManager.unregisterApplicationMaster(appStatus, appMessage, null); 635 return isSuccess; 636 } finally { 637 resourceManager.stop(); 638 } 639 } 640 641 /** 642 * Thread to connect to the {@link ContainerManager} and launch the container 643 * that will execute the shell command. 644 */ 645 private class LaunchContainerRunnable implements Runnable { 646 647 // Allocated container 648 Container container; 649 // Handle to communicate with ContainerManager 650 ContainerManager cm; 651 652 /** 653 * @param lcontainer Allocated container 654 */ 655 public LaunchContainerRunnable(Container lcontainer) { 656 this.container = lcontainer; 657 } 658 659 /** 660 * Helper function to connect to CM 661 */ 662 private void connectToCM() { 663 LOG.debug("Connecting to ContainerManager for containerid=" 664 + container.getId()); 665 String cmIpPortStr = container.getNodeId().getHost() + ":" 666 + container.getNodeId().getPort(); 667 InetSocketAddress cmAddress = NetUtils.createSocketAddr(cmIpPortStr); 668 LOG.info("Connecting to ContainerManager at " + cmIpPortStr); 669 this.cm = ((ContainerManager) rpc.getProxy(ContainerManager.class, 670 cmAddress, conf)); 671 } 672 673 @Override 674 /** 675 * Connects to CM, sets up container launch context 676 * for shell command and eventually dispatches the container 677 * start request to the CM. 678 */ 679 public void run() { 680 // Connect to ContainerManager 681 connectToCM(); 682 683 LOG.info("Setting up container launch container for containerid=" 684 + container.getId()); 685 ContainerLaunchContext ctx = Records 686 .newRecord(ContainerLaunchContext.class); 687 688 ctx.setContainerId(container.getId()); 689 ctx.setResource(container.getResource()); 690 691 String jobUserName = System.getenv(ApplicationConstants.Environment.USER 692 .name()); 693 ctx.setUser(jobUserName); 694 LOG.info("Setting user in ContainerLaunchContext to: " + jobUserName); 695 696 // Set the environment 697 ctx.setEnvironment(shellEnv); 698 699 // Set the local resources 700 Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); 701 702 // The container for the eventual shell commands needs its own local 703 // resources too. 704 // In this scenario, if a shell script is specified, we need to have it 705 // copied and made available to the container. 706 if (!shellScriptPath.isEmpty()) { 707 LocalResource shellRsrc = Records.newRecord(LocalResource.class); 708 shellRsrc.setType(LocalResourceType.FILE); 709 shellRsrc.setVisibility(LocalResourceVisibility.APPLICATION); 710 try { 711 shellRsrc.setResource(ConverterUtils.getYarnUrlFromURI(new URI( 712 shellScriptPath))); 713 } catch (URISyntaxException e) { 714 LOG.error("Error when trying to use shell script path specified" 715 + " in env, path=" + shellScriptPath); 716 e.printStackTrace(); 717 718 // A failure scenario on bad input such as invalid shell script path 719 // We know we cannot continue launching the container 720 // so we should release it. 721 // TODO 722 numCompletedContainers.incrementAndGet(); 723 numFailedContainers.incrementAndGet(); 724 return; 725 } 726 shellRsrc.setTimestamp(shellScriptPathTimestamp); 727 shellRsrc.setSize(shellScriptPathLen); 728 localResources.put(ExecShellStringPath, shellRsrc); 729 } 730 ctx.setLocalResources(localResources); 731 732 // Set the necessary command to execute on the allocated container 733 Vector<CharSequence> vargs = new Vector<CharSequence>(5); 734 735 // Set executable command 736 vargs.add(shellCommand); 737 // Set shell script path 738 if (!shellScriptPath.isEmpty()) { 739 vargs.add(ExecShellStringPath); 740 } 741 742 // Set args for the shell command if any 743 vargs.add(shellArgs); 744 // Add log redirect params 745 vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"); 746 vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"); 747 748 // Get final commmand 749 StringBuilder command = new StringBuilder(); 750 for (CharSequence str : vargs) { 751 command.append(str).append(" "); 752 } 753 754 List<String> commands = new ArrayList<String>(); 755 commands.add(command.toString()); 756 ctx.setCommands(commands); 757 758 StartContainerRequest startReq = Records 759 .newRecord(StartContainerRequest.class); 760 startReq.setContainerLaunchContext(ctx); 761 try { 762 cm.startContainer(startReq); 763 } catch (YarnRemoteException e) { 764 LOG.info("Start container failed for :" + ", containerId=" 765 + container.getId()); 766 e.printStackTrace(); 767 // TODO do we need to release this container? 768 } 769 770 // Get container status? 771 // Left commented out as the shell scripts are short lived 772 // and we are relying on the status for completed containers 773 // from RM to detect status 774 775 // GetContainerStatusRequest statusReq = 776 // Records.newRecord(GetContainerStatusRequest.class); 777 // statusReq.setContainerId(container.getId()); 778 // GetContainerStatusResponse statusResp; 779 // try { 780 // statusResp = cm.getContainerStatus(statusReq); 781 // LOG.info("Container Status" 782 // + ", id=" + container.getId() 783 // + ", status=" +statusResp.getStatus()); 784 // } catch (YarnRemoteException e) { 785 // e.printStackTrace(); 786 // } 787 } 788 } 789 790 /** 791 * Setup the request that will be sent to the RM for the container ask. 792 * 793 * @param numContainers Containers to ask for from RM 794 * @return the setup ResourceRequest to be sent to RM 795 */ 796 private ContainerRequest setupContainerAskForRM(int numContainers) { 797 // setup requirements for hosts 798 // using * as any host will do for the distributed shell app 799 // set the priority for the request 800 Priority pri = Records.newRecord(Priority.class); 801 // TODO - what is the range for priority? how to decide? 802 pri.setPriority(requestPriority); 803 804 // Set up resource type requirements 805 // For now, only memory is supported so we set memory requirements 806 Resource capability = Records.newRecord(Resource.class); 807 capability.setMemory(containerMemory); 808 809 ContainerRequest request = new ContainerRequest(capability, null, null, 810 pri, numContainers); 811 LOG.info("Requested container ask: " + request.toString()); 812 return request; 813 } 814 815 /** 816 * Ask RM to allocate given no. of containers to this Application Master 817 * 818 * @param requestedContainers Containers to ask for from RM 819 * @return Response from RM to AM with allocated containers 820 * @throws YarnRemoteException 821 */ 822 private AMResponse sendContainerAskToRM() throws YarnRemoteException { 823 float progressIndicator = (float) numCompletedContainers.get() 824 / numTotalContainers; 825 826 LOG.info("Sending request to RM for containers" + ", progress=" 827 + progressIndicator); 828 829 AllocateResponse resp = resourceManager.allocate(progressIndicator); 830 return resp.getAMResponse(); 831 } 832 }