001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.yarn.applications.distributedshell; 020 021 import java.io.BufferedReader; 022 import java.io.IOException; 023 import java.io.InputStreamReader; 024 import java.net.InetSocketAddress; 025 import java.net.URI; 026 import java.net.URISyntaxException; 027 import java.util.ArrayList; 028 import java.util.HashMap; 029 import java.util.List; 030 import java.util.Map; 031 import java.util.Vector; 032 import java.util.concurrent.CopyOnWriteArrayList; 033 import java.util.concurrent.atomic.AtomicInteger; 034 035 import org.apache.commons.cli.CommandLine; 036 import org.apache.commons.cli.GnuParser; 037 import org.apache.commons.cli.HelpFormatter; 038 import org.apache.commons.cli.Options; 039 import org.apache.commons.cli.ParseException; 040 import org.apache.commons.logging.Log; 041 import org.apache.commons.logging.LogFactory; 042 043 import org.apache.hadoop.classification.InterfaceAudience; 044 import org.apache.hadoop.classification.InterfaceStability; 045 import org.apache.hadoop.conf.Configuration; 046 import org.apache.hadoop.net.NetUtils; 047 import org.apache.hadoop.yarn.api.AMRMProtocol; 048 import org.apache.hadoop.yarn.api.ApplicationConstants; 049 import org.apache.hadoop.yarn.api.ContainerManager; 050 051 import org.apache.hadoop.yarn.api.protocolrecords.AllocateRequest; 052 import org.apache.hadoop.yarn.api.protocolrecords.AllocateResponse; 053 import org.apache.hadoop.yarn.api.protocolrecords.FinishApplicationMasterRequest; 054 //import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusRequest; 055 //import org.apache.hadoop.yarn.api.protocolrecords.GetContainerStatusResponse; 056 import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterRequest; 057 import org.apache.hadoop.yarn.api.protocolrecords.RegisterApplicationMasterResponse; 058 import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest; 059 060 import org.apache.hadoop.yarn.api.records.AMResponse; 061 import org.apache.hadoop.yarn.api.records.ApplicationAttemptId; 062 import org.apache.hadoop.yarn.api.records.Container; 063 import org.apache.hadoop.yarn.api.records.ContainerId; 064 import org.apache.hadoop.yarn.api.records.ContainerLaunchContext; 065 import org.apache.hadoop.yarn.api.records.ContainerState; 066 import org.apache.hadoop.yarn.api.records.ContainerStatus; 067 import org.apache.hadoop.yarn.api.records.FinalApplicationStatus; 068 import org.apache.hadoop.yarn.api.records.LocalResource; 069 import org.apache.hadoop.yarn.api.records.LocalResourceType; 070 import org.apache.hadoop.yarn.api.records.LocalResourceVisibility; 071 import org.apache.hadoop.yarn.api.records.Priority; 072 import org.apache.hadoop.yarn.api.records.Resource; 073 import org.apache.hadoop.yarn.api.records.ResourceRequest; 074 import org.apache.hadoop.yarn.conf.YarnConfiguration; 075 import org.apache.hadoop.yarn.exceptions.YarnRemoteException; 076 import org.apache.hadoop.yarn.ipc.YarnRPC; 077 import org.apache.hadoop.yarn.util.ConverterUtils; 078 import org.apache.hadoop.yarn.util.Records; 079 080 /** 081 * An ApplicationMaster for executing shell commands on a set of launched containers using the YARN framework. 082 * 083 * <p>This class is meant to act as an example on how to write yarn-based application masters. </p> 084 * 085 * <p> The ApplicationMaster is started on a container by the <code>ResourceManager</code>'s launcher. 086 * The first thing that the <code>ApplicationMaster</code> needs to do is to connect and register itself with 087 * the <code>ResourceManager</code>. The registration sets up information within the <code>ResourceManager</code> 088 * regarding what host:port the ApplicationMaster is listening on to provide any form of functionality to a client 089 * as well as a tracking url that a client can use to keep track of status/job history if needed. </p> 090 * 091 * <p> The <code>ApplicationMaster</code> needs to send a heartbeat to the <code>ResourceManager</code> at regular intervals 092 * to inform the <code>ResourceManager</code> that it is up and alive. The {@link AMRMProtocol#allocate} to the 093 * <code>ResourceManager</code> from the <code>ApplicationMaster</code> acts as a heartbeat. 094 * 095 * <p> For the actual handling of the job, the <code>ApplicationMaster</code> has to request the 096 * <code>ResourceManager</code> via {@link AllocateRequest} for the required no. of containers using {@link ResourceRequest} 097 * with the necessary resource specifications such as node location, computational (memory/disk/cpu) resource requirements. 098 * The <code>ResourceManager</code> responds with an {@link AllocateResponse} that informs the <code>ApplicationMaster</code> 099 * of the set of newly allocated containers, completed containers as well as current state of available resources. </p> 100 * 101 * <p> For each allocated container, the <code>ApplicationMaster</code> can then set up the necessary launch context via 102 * {@link ContainerLaunchContext} to specify the allocated container id, local resources required by the executable, 103 * the environment to be setup for the executable, commands to execute, etc. and submit a {@link StartContainerRequest} 104 * to the {@link ContainerManager} to launch and execute the defined commands on the given allocated container. </p> 105 * 106 * <p> The <code>ApplicationMaster</code> can monitor the launched container by either querying the <code>ResourceManager</code> 107 * using {@link AMRMProtocol#allocate} to get updates on completed containers or via the {@link ContainerManager} 108 * by querying for the status of the allocated container's {@link ContainerId}. 109 * 110 * <p> After the job has been completed, the <code>ApplicationMaster</code> has to send a {@link FinishApplicationMasterRequest} 111 * to the <code>ResourceManager</code> to inform it that the <code>ApplicationMaster</code> has been completed. 112 */ 113 @InterfaceAudience.Public 114 @InterfaceStability.Unstable 115 public class ApplicationMaster { 116 117 private static final Log LOG = LogFactory.getLog(ApplicationMaster.class); 118 119 // Configuration 120 private Configuration conf; 121 // YARN RPC to communicate with the Resource Manager or Node Manager 122 private YarnRPC rpc; 123 124 // Handle to communicate with the Resource Manager 125 private AMRMProtocol resourceManager; 126 127 // Application Attempt Id ( combination of attemptId and fail count ) 128 private ApplicationAttemptId appAttemptID; 129 130 // TODO 131 // For status update for clients - yet to be implemented 132 // Hostname of the container 133 private String appMasterHostname = ""; 134 // Port on which the app master listens for status update requests from clients 135 private int appMasterRpcPort = 0; 136 // Tracking url to which app master publishes info for clients to monitor 137 private String appMasterTrackingUrl = ""; 138 139 // App Master configuration 140 // No. of containers to run shell command on 141 private int numTotalContainers = 1; 142 // Memory to request for the container on which the shell command will run 143 private int containerMemory = 10; 144 // Priority of the request 145 private int requestPriority; 146 147 // Incremental counter for rpc calls to the RM 148 private AtomicInteger rmRequestID = new AtomicInteger(); 149 150 // Simple flag to denote whether all works is done 151 private boolean appDone = false; 152 // Counter for completed containers ( complete denotes successful or failed ) 153 private AtomicInteger numCompletedContainers = new AtomicInteger(); 154 // Allocated container count so that we know how many containers has the RM 155 // allocated to us 156 private AtomicInteger numAllocatedContainers = new AtomicInteger(); 157 // Count of failed containers 158 private AtomicInteger numFailedContainers = new AtomicInteger(); 159 // Count of containers already requested from the RM 160 // Needed as once requested, we should not request for containers again and again. 161 // Only request for more if the original requirement changes. 162 private AtomicInteger numRequestedContainers = new AtomicInteger(); 163 164 // Shell command to be executed 165 private String shellCommand = ""; 166 // Args to be passed to the shell command 167 private String shellArgs = ""; 168 // Env variables to be setup for the shell command 169 private Map<String, String> shellEnv = new HashMap<String, String>(); 170 171 // Location of shell script ( obtained from info set in env ) 172 // Shell script path in fs 173 private String shellScriptPath = ""; 174 // Timestamp needed for creating a local resource 175 private long shellScriptPathTimestamp = 0; 176 // File length needed for local resource 177 private long shellScriptPathLen = 0; 178 179 // Hardcoded path to shell script in launch container's local env 180 private final String ExecShellStringPath = "ExecShellScript.sh"; 181 182 // Containers to be released 183 private CopyOnWriteArrayList<ContainerId> releasedContainers = new CopyOnWriteArrayList<ContainerId>(); 184 185 // Launch threads 186 private List<Thread> launchThreads = new ArrayList<Thread>(); 187 188 /** 189 * @param args Command line args 190 */ 191 public static void main(String[] args) { 192 boolean result = false; 193 try { 194 ApplicationMaster appMaster = new ApplicationMaster(); 195 LOG.info("Initializing ApplicationMaster"); 196 boolean doRun = appMaster.init(args); 197 if (!doRun) { 198 System.exit(0); 199 } 200 result = appMaster.run(); 201 } catch (Throwable t) { 202 LOG.fatal("Error running ApplicationMaster", t); 203 System.exit(1); 204 } 205 if (result) { 206 LOG.info("Application Master completed successfully. exiting"); 207 System.exit(0); 208 } 209 else { 210 LOG.info("Application Master failed. exiting"); 211 System.exit(2); 212 } 213 } 214 215 /** 216 * Dump out contents of $CWD and the environment to stdout for debugging 217 */ 218 private void dumpOutDebugInfo() { 219 220 LOG.info("Dump debug output"); 221 Map<String, String> envs = System.getenv(); 222 for (Map.Entry<String, String> env : envs.entrySet()) { 223 LOG.info("System env: key=" + env.getKey() + ", val=" + env.getValue()); 224 System.out.println("System env: key=" + env.getKey() + ", val=" + env.getValue()); 225 } 226 227 String cmd = "ls -al"; 228 Runtime run = Runtime.getRuntime(); 229 Process pr = null; 230 try { 231 pr = run.exec(cmd); 232 pr.waitFor(); 233 234 BufferedReader buf = new BufferedReader(new InputStreamReader(pr.getInputStream())); 235 String line = ""; 236 while ((line=buf.readLine())!=null) { 237 LOG.info("System CWD content: " + line); 238 System.out.println("System CWD content: " + line); 239 } 240 buf.close(); 241 } catch (IOException e) { 242 e.printStackTrace(); 243 } catch (InterruptedException e) { 244 e.printStackTrace(); 245 } 246 } 247 248 public ApplicationMaster() throws Exception { 249 // Set up the configuration and RPC 250 conf = new Configuration(); 251 rpc = YarnRPC.create(conf); 252 } 253 /** 254 * Parse command line options 255 * @param args Command line args 256 * @return Whether init successful and run should be invoked 257 * @throws ParseException 258 * @throws IOException 259 */ 260 public boolean init(String[] args) throws ParseException, IOException { 261 262 Options opts = new Options(); 263 opts.addOption("app_attempt_id", true, "App Attempt ID. Not to be used unless for testing purposes"); 264 opts.addOption("shell_command", true, "Shell command to be executed by the Application Master"); 265 opts.addOption("shell_script", true, "Location of the shell script to be executed"); 266 opts.addOption("shell_args", true, "Command line args for the shell script"); 267 opts.addOption("shell_env", true, "Environment for shell script. Specified as env_key=env_val pairs"); 268 opts.addOption("container_memory", true, "Amount of memory in MB to be requested to run the shell command"); 269 opts.addOption("num_containers", true, "No. of containers on which the shell command needs to be executed"); 270 opts.addOption("priority", true, "Application Priority. Default 0"); 271 opts.addOption("debug", false, "Dump out debug information"); 272 273 opts.addOption("help", false, "Print usage"); 274 CommandLine cliParser = new GnuParser().parse(opts, args); 275 276 if (args.length == 0) { 277 printUsage(opts); 278 throw new IllegalArgumentException("No args specified for application master to initialize"); 279 } 280 281 if (cliParser.hasOption("help")) { 282 printUsage(opts); 283 return false; 284 } 285 286 if (cliParser.hasOption("debug")) { 287 dumpOutDebugInfo(); 288 } 289 290 Map<String, String> envs = System.getenv(); 291 292 appAttemptID = Records.newRecord(ApplicationAttemptId.class); 293 if (envs.containsKey(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV)) { 294 appAttemptID = ConverterUtils.toApplicationAttemptId(envs 295 .get(ApplicationConstants.AM_APP_ATTEMPT_ID_ENV)); 296 } else if (!envs.containsKey(ApplicationConstants.AM_CONTAINER_ID_ENV)) { 297 if (cliParser.hasOption("app_attempt_id")) { 298 String appIdStr = cliParser.getOptionValue("app_attempt_id", ""); 299 appAttemptID = ConverterUtils.toApplicationAttemptId(appIdStr); 300 } 301 else { 302 throw new IllegalArgumentException("Application Attempt Id not set in the environment"); 303 } 304 } else { 305 ContainerId containerId = ConverterUtils.toContainerId(envs.get(ApplicationConstants.AM_CONTAINER_ID_ENV)); 306 appAttemptID = containerId.getApplicationAttemptId(); 307 } 308 309 LOG.info("Application master for app" 310 + ", appId=" + appAttemptID.getApplicationId().getId() 311 + ", clustertimestamp=" + appAttemptID.getApplicationId().getClusterTimestamp() 312 + ", attemptId=" + appAttemptID.getAttemptId()); 313 314 if (!cliParser.hasOption("shell_command")) { 315 throw new IllegalArgumentException("No shell command specified to be executed by application master"); 316 } 317 shellCommand = cliParser.getOptionValue("shell_command"); 318 319 if (cliParser.hasOption("shell_args")) { 320 shellArgs = cliParser.getOptionValue("shell_args"); 321 } 322 if (cliParser.hasOption("shell_env")) { 323 String shellEnvs[] = cliParser.getOptionValues("shell_env"); 324 for (String env : shellEnvs) { 325 env = env.trim(); 326 int index = env.indexOf('='); 327 if (index == -1) { 328 shellEnv.put(env, ""); 329 continue; 330 } 331 String key = env.substring(0, index); 332 String val = ""; 333 if (index < (env.length()-1)) { 334 val = env.substring(index+1); 335 } 336 shellEnv.put(key, val); 337 } 338 } 339 340 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION)) { 341 shellScriptPath = envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLOCATION); 342 343 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)) { 344 shellScriptPathTimestamp = Long.valueOf(envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTTIMESTAMP)); 345 } 346 if (envs.containsKey(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)) { 347 shellScriptPathLen = Long.valueOf(envs.get(DSConstants.DISTRIBUTEDSHELLSCRIPTLEN)); 348 } 349 350 if (!shellScriptPath.isEmpty() 351 && (shellScriptPathTimestamp <= 0 352 || shellScriptPathLen <= 0)) { 353 LOG.error("Illegal values in env for shell script path" 354 + ", path=" + shellScriptPath 355 + ", len=" + shellScriptPathLen 356 + ", timestamp=" + shellScriptPathTimestamp); 357 throw new IllegalArgumentException("Illegal values in env for shell script path"); 358 } 359 } 360 361 containerMemory = Integer.parseInt(cliParser.getOptionValue("container_memory", "10")); 362 numTotalContainers = Integer.parseInt(cliParser.getOptionValue("num_containers", "1")); 363 requestPriority = Integer.parseInt(cliParser.getOptionValue("priority", "0")); 364 365 return true; 366 } 367 368 /** 369 * Helper function to print usage 370 * @param opts Parsed command line options 371 */ 372 private void printUsage(Options opts) { 373 new HelpFormatter().printHelp("ApplicationMaster", opts); 374 } 375 376 /** 377 * Main run function for the application master 378 * @throws YarnRemoteException 379 */ 380 public boolean run() throws YarnRemoteException { 381 LOG.info("Starting ApplicationMaster"); 382 383 // Connect to ResourceManager 384 resourceManager = connectToRM(); 385 386 // Setup local RPC Server to accept status requests directly from clients 387 // TODO need to setup a protocol for client to be able to communicate to the RPC server 388 // TODO use the rpc port info to register with the RM for the client to send requests to this app master 389 390 // Register self with ResourceManager 391 RegisterApplicationMasterResponse response = registerToRM(); 392 // Dump out information about cluster capability as seen by the resource manager 393 int minMem = response.getMinimumResourceCapability().getMemory(); 394 int maxMem = response.getMaximumResourceCapability().getMemory(); 395 LOG.info("Min mem capabililty of resources in this cluster " + minMem); 396 LOG.info("Max mem capabililty of resources in this cluster " + maxMem); 397 398 // A resource ask has to be atleast the minimum of the capability of the cluster, the value has to be 399 // a multiple of the min value and cannot exceed the max. 400 // If it is not an exact multiple of min, the RM will allocate to the nearest multiple of min 401 if (containerMemory < minMem) { 402 LOG.info("Container memory specified below min threshold of cluster. Using min value." 403 + ", specified=" + containerMemory 404 + ", min=" + minMem); 405 containerMemory = minMem; 406 } 407 else if (containerMemory > maxMem) { 408 LOG.info("Container memory specified above max threshold of cluster. Using max value." 409 + ", specified=" + containerMemory 410 + ", max=" + maxMem); 411 containerMemory = maxMem; 412 } 413 414 // Setup heartbeat emitter 415 // TODO poll RM every now and then with an empty request to let RM know that we are alive 416 // The heartbeat interval after which an AM is timed out by the RM is defined by a config setting: 417 // RM_AM_EXPIRY_INTERVAL_MS with default defined by DEFAULT_RM_AM_EXPIRY_INTERVAL_MS 418 // The allocate calls to the RM count as heartbeats so, for now, this additional heartbeat emitter 419 // is not required. 420 421 // Setup ask for containers from RM 422 // Send request for containers to RM 423 // Until we get our fully allocated quota, we keep on polling RM for containers 424 // Keep looping until all the containers are launched and shell script executed on them 425 // ( regardless of success/failure). 426 427 int loopCounter = -1; 428 429 while (numCompletedContainers.get() < numTotalContainers 430 && !appDone) { 431 loopCounter++; 432 433 // log current state 434 LOG.info("Current application state: loop=" + loopCounter 435 + ", appDone=" + appDone 436 + ", total=" + numTotalContainers 437 + ", requested=" + numRequestedContainers 438 + ", completed=" + numCompletedContainers 439 + ", failed=" + numFailedContainers 440 + ", currentAllocated=" + numAllocatedContainers); 441 442 // Sleep before each loop when asking RM for containers 443 // to avoid flooding RM with spurious requests when it 444 // need not have any available containers 445 // Sleeping for 1000 ms. 446 try { 447 Thread.sleep(1000); 448 } catch (InterruptedException e) { 449 LOG.info("Sleep interrupted " + e.getMessage()); 450 } 451 452 // No. of containers to request 453 // For the first loop, askCount will be equal to total containers needed 454 // From that point on, askCount will always be 0 as current implementation 455 // does not change its ask on container failures. 456 int askCount = numTotalContainers - numRequestedContainers.get(); 457 numRequestedContainers.addAndGet(askCount); 458 459 // Setup request to be sent to RM to allocate containers 460 List<ResourceRequest> resourceReq = new ArrayList<ResourceRequest>(); 461 if (askCount > 0) { 462 ResourceRequest containerAsk = setupContainerAskForRM(askCount); 463 resourceReq.add(containerAsk); 464 } 465 466 // Send the request to RM 467 LOG.info("Asking RM for containers" 468 + ", askCount=" + askCount); 469 AMResponse amResp =sendContainerAskToRM(resourceReq); 470 471 // Retrieve list of allocated containers from the response 472 List<Container> allocatedContainers = amResp.getAllocatedContainers(); 473 LOG.info("Got response from RM for container ask, allocatedCnt=" + allocatedContainers.size()); 474 numAllocatedContainers.addAndGet(allocatedContainers.size()); 475 for (Container allocatedContainer : allocatedContainers) { 476 LOG.info("Launching shell command on a new container." 477 + ", containerId=" + allocatedContainer.getId() 478 + ", containerNode=" + allocatedContainer.getNodeId().getHost() 479 + ":" + allocatedContainer.getNodeId().getPort() 480 + ", containerNodeURI=" + allocatedContainer.getNodeHttpAddress() 481 + ", containerState" + allocatedContainer.getState() 482 + ", containerResourceMemory" + allocatedContainer.getResource().getMemory()); 483 //+ ", containerToken" + allocatedContainer.getContainerToken().getIdentifier().toString()); 484 485 LaunchContainerRunnable runnableLaunchContainer = new LaunchContainerRunnable(allocatedContainer); 486 Thread launchThread = new Thread(runnableLaunchContainer); 487 488 // launch and start the container on a separate thread to keep the main thread unblocked 489 // as all containers may not be allocated at one go. 490 launchThreads.add(launchThread); 491 launchThread.start(); 492 } 493 494 // Check what the current available resources in the cluster are 495 // TODO should we do anything if the available resources are not enough? 496 Resource availableResources = amResp.getAvailableResources(); 497 LOG.info("Current available resources in the cluster " + availableResources); 498 499 // Check the completed containers 500 List<ContainerStatus> completedContainers = amResp.getCompletedContainersStatuses(); 501 LOG.info("Got response from RM for container ask, completedCnt=" + completedContainers.size()); 502 for (ContainerStatus containerStatus : completedContainers) { 503 LOG.info("Got container status for containerID= " + containerStatus.getContainerId() 504 + ", state=" + containerStatus.getState() 505 + ", exitStatus=" + containerStatus.getExitStatus() 506 + ", diagnostics=" + containerStatus.getDiagnostics()); 507 508 // non complete containers should not be here 509 assert(containerStatus.getState() == ContainerState.COMPLETE); 510 511 // increment counters for completed/failed containers 512 int exitStatus = containerStatus.getExitStatus(); 513 if (0 != exitStatus) { 514 // container failed 515 if (-100 != exitStatus) { 516 // shell script failed 517 // counts as completed 518 numCompletedContainers.incrementAndGet(); 519 numFailedContainers.incrementAndGet(); 520 } 521 else { 522 // something else bad happened 523 // app job did not complete for some reason 524 // we should re-try as the container was lost for some reason 525 numAllocatedContainers.decrementAndGet(); 526 numRequestedContainers.decrementAndGet(); 527 // we do not need to release the container as it would be done 528 // by the RM/CM. 529 } 530 } 531 else { 532 // nothing to do 533 // container completed successfully 534 numCompletedContainers.incrementAndGet(); 535 LOG.info("Container completed successfully." 536 + ", containerId=" + containerStatus.getContainerId()); 537 } 538 539 } 540 if (numCompletedContainers.get() == numTotalContainers) { 541 appDone = true; 542 } 543 544 LOG.info("Current application state: loop=" + loopCounter 545 + ", appDone=" + appDone 546 + ", total=" + numTotalContainers 547 + ", requested=" + numRequestedContainers 548 + ", completed=" + numCompletedContainers 549 + ", failed=" + numFailedContainers 550 + ", currentAllocated=" + numAllocatedContainers); 551 552 // TODO 553 // Add a timeout handling layer 554 // for misbehaving shell commands 555 } 556 557 // Join all launched threads 558 // needed for when we time out 559 // and we need to release containers 560 for (Thread launchThread : launchThreads) { 561 try { 562 launchThread.join(10000); 563 } catch (InterruptedException e) { 564 LOG.info("Exception thrown in thread join: " + e.getMessage()); 565 e.printStackTrace(); 566 } 567 } 568 569 // When the application completes, it should send a finish application signal 570 // to the RM 571 LOG.info("Application completed. Signalling finish to RM"); 572 573 FinishApplicationMasterRequest finishReq = Records.newRecord(FinishApplicationMasterRequest.class); 574 finishReq.setAppAttemptId(appAttemptID); 575 boolean isSuccess = true; 576 if (numFailedContainers.get() == 0) { 577 finishReq.setFinishApplicationStatus(FinalApplicationStatus.SUCCEEDED); 578 } 579 else { 580 finishReq.setFinishApplicationStatus(FinalApplicationStatus.FAILED); 581 String diagnostics = "Diagnostics." 582 + ", total=" + numTotalContainers 583 + ", completed=" + numCompletedContainers.get() 584 + ", allocated=" + numAllocatedContainers.get() 585 + ", failed=" + numFailedContainers.get(); 586 finishReq.setDiagnostics(diagnostics); 587 isSuccess = false; 588 } 589 resourceManager.finishApplicationMaster(finishReq); 590 return isSuccess; 591 } 592 593 /** 594 * Thread to connect to the {@link ContainerManager} and 595 * launch the container that will execute the shell command. 596 */ 597 private class LaunchContainerRunnable implements Runnable { 598 599 // Allocated container 600 Container container; 601 // Handle to communicate with ContainerManager 602 ContainerManager cm; 603 604 /** 605 * @param lcontainer Allocated container 606 */ 607 public LaunchContainerRunnable(Container lcontainer) { 608 this.container = lcontainer; 609 } 610 611 /** 612 * Helper function to connect to CM 613 */ 614 private void connectToCM() { 615 LOG.debug("Connecting to ContainerManager for containerid=" + container.getId()); 616 String cmIpPortStr = container.getNodeId().getHost() + ":" 617 + container.getNodeId().getPort(); 618 InetSocketAddress cmAddress = NetUtils.createSocketAddr(cmIpPortStr); 619 LOG.info("Connecting to ContainerManager at " + cmIpPortStr); 620 this.cm = ((ContainerManager) rpc.getProxy(ContainerManager.class, cmAddress, conf)); 621 } 622 623 624 @Override 625 /** 626 * Connects to CM, sets up container launch context 627 * for shell command and eventually dispatches the container 628 * start request to the CM. 629 */ 630 public void run() { 631 // Connect to ContainerManager 632 connectToCM(); 633 634 LOG.info("Setting up container launch container for containerid=" + container.getId()); 635 ContainerLaunchContext ctx = Records.newRecord(ContainerLaunchContext.class); 636 637 ctx.setContainerId(container.getId()); 638 ctx.setResource(container.getResource()); 639 640 String jobUserName = System.getenv(ApplicationConstants.Environment.USER 641 .name()); 642 ctx.setUser(jobUserName); 643 LOG.info("Setting user in ContainerLaunchContext to: " + jobUserName); 644 645 // Set the environment 646 ctx.setEnvironment(shellEnv); 647 648 // Set the local resources 649 Map<String, LocalResource> localResources = new HashMap<String, LocalResource>(); 650 651 // The container for the eventual shell commands needs its own local resources too. 652 // In this scenario, if a shell script is specified, we need to have it copied 653 // and made available to the container. 654 if (!shellScriptPath.isEmpty()) { 655 LocalResource shellRsrc = Records.newRecord(LocalResource.class); 656 shellRsrc.setType(LocalResourceType.FILE); 657 shellRsrc.setVisibility(LocalResourceVisibility.APPLICATION); 658 try { 659 shellRsrc.setResource(ConverterUtils.getYarnUrlFromURI(new URI(shellScriptPath))); 660 } catch (URISyntaxException e) { 661 LOG.error("Error when trying to use shell script path specified in env" 662 + ", path=" + shellScriptPath); 663 e.printStackTrace(); 664 665 // A failure scenario on bad input such as invalid shell script path 666 // We know we cannot continue launching the container 667 // so we should release it. 668 // TODO 669 numCompletedContainers.incrementAndGet(); 670 numFailedContainers.incrementAndGet(); 671 return; 672 } 673 shellRsrc.setTimestamp(shellScriptPathTimestamp); 674 shellRsrc.setSize(shellScriptPathLen); 675 localResources.put(ExecShellStringPath, shellRsrc); 676 } 677 ctx.setLocalResources(localResources); 678 679 // Set the necessary command to execute on the allocated container 680 Vector<CharSequence> vargs = new Vector<CharSequence>(5); 681 682 // Set executable command 683 vargs.add(shellCommand); 684 // Set shell script path 685 if (!shellScriptPath.isEmpty()) { 686 vargs.add(ExecShellStringPath); 687 } 688 689 // Set args for the shell command if any 690 vargs.add(shellArgs); 691 // Add log redirect params 692 // TODO 693 // We should redirect the output to hdfs instead of local logs 694 // so as to be able to look at the final output after the containers 695 // have been released. 696 // Could use a path suffixed with /AppId/AppAttempId/ContainerId/std[out|err] 697 vargs.add("1>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stdout"); 698 vargs.add("2>" + ApplicationConstants.LOG_DIR_EXPANSION_VAR + "/stderr"); 699 700 // Get final commmand 701 StringBuilder command = new StringBuilder(); 702 for (CharSequence str : vargs) { 703 command.append(str).append(" "); 704 } 705 706 List<String> commands = new ArrayList<String>(); 707 commands.add(command.toString()); 708 ctx.setCommands(commands); 709 710 StartContainerRequest startReq = Records.newRecord(StartContainerRequest.class); 711 startReq.setContainerLaunchContext(ctx); 712 try { 713 cm.startContainer(startReq); 714 } catch (YarnRemoteException e) { 715 LOG.info("Start container failed for :" 716 + ", containerId=" + container.getId()); 717 e.printStackTrace(); 718 // TODO do we need to release this container? 719 } 720 721 // Get container status? 722 // Left commented out as the shell scripts are short lived 723 // and we are relying on the status for completed containers from RM to detect status 724 725 // GetContainerStatusRequest statusReq = Records.newRecord(GetContainerStatusRequest.class); 726 // statusReq.setContainerId(container.getId()); 727 // GetContainerStatusResponse statusResp; 728 //try { 729 //statusResp = cm.getContainerStatus(statusReq); 730 // LOG.info("Container Status" 731 // + ", id=" + container.getId() 732 // + ", status=" +statusResp.getStatus()); 733 //} catch (YarnRemoteException e) { 734 //e.printStackTrace(); 735 //} 736 } 737 } 738 739 /** 740 * Connect to the Resource Manager 741 * @return Handle to communicate with the RM 742 */ 743 private AMRMProtocol connectToRM() { 744 YarnConfiguration yarnConf = new YarnConfiguration(conf); 745 InetSocketAddress rmAddress = yarnConf.getSocketAddr( 746 YarnConfiguration.RM_SCHEDULER_ADDRESS, 747 YarnConfiguration.DEFAULT_RM_SCHEDULER_ADDRESS, 748 YarnConfiguration.DEFAULT_RM_SCHEDULER_PORT); 749 LOG.info("Connecting to ResourceManager at " + rmAddress); 750 return ((AMRMProtocol) rpc.getProxy(AMRMProtocol.class, rmAddress, conf)); 751 } 752 753 /** 754 * Register the Application Master to the Resource Manager 755 * @return the registration response from the RM 756 * @throws YarnRemoteException 757 */ 758 private RegisterApplicationMasterResponse registerToRM() throws YarnRemoteException { 759 RegisterApplicationMasterRequest appMasterRequest = Records.newRecord(RegisterApplicationMasterRequest.class); 760 761 // set the required info into the registration request: 762 // application attempt id, 763 // host on which the app master is running 764 // rpc port on which the app master accepts requests from the client 765 // tracking url for the app master 766 appMasterRequest.setApplicationAttemptId(appAttemptID); 767 appMasterRequest.setHost(appMasterHostname); 768 appMasterRequest.setRpcPort(appMasterRpcPort); 769 appMasterRequest.setTrackingUrl(appMasterTrackingUrl); 770 771 return resourceManager.registerApplicationMaster(appMasterRequest); 772 } 773 774 /** 775 * Setup the request that will be sent to the RM for the container ask. 776 * @param numContainers Containers to ask for from RM 777 * @return the setup ResourceRequest to be sent to RM 778 */ 779 private ResourceRequest setupContainerAskForRM(int numContainers) { 780 ResourceRequest request = Records.newRecord(ResourceRequest.class); 781 782 // setup requirements for hosts 783 // whether a particular rack/host is needed 784 // Refer to apis under org.apache.hadoop.net for more 785 // details on how to get figure out rack/host mapping. 786 // using * as any host will do for the distributed shell app 787 request.setHostName("*"); 788 789 // set no. of containers needed 790 request.setNumContainers(numContainers); 791 792 // set the priority for the request 793 Priority pri = Records.newRecord(Priority.class); 794 // TODO - what is the range for priority? how to decide? 795 pri.setPriority(requestPriority); 796 request.setPriority(pri); 797 798 // Set up resource type requirements 799 // For now, only memory is supported so we set memory requirements 800 Resource capability = Records.newRecord(Resource.class); 801 capability.setMemory(containerMemory); 802 request.setCapability(capability); 803 804 return request; 805 } 806 807 /** 808 * Ask RM to allocate given no. of containers to this Application Master 809 * @param requestedContainers Containers to ask for from RM 810 * @return Response from RM to AM with allocated containers 811 * @throws YarnRemoteException 812 */ 813 private AMResponse sendContainerAskToRM(List<ResourceRequest> requestedContainers) 814 throws YarnRemoteException { 815 AllocateRequest req = Records.newRecord(AllocateRequest.class); 816 req.setResponseId(rmRequestID.incrementAndGet()); 817 req.setApplicationAttemptId(appAttemptID); 818 req.addAllAsks(requestedContainers); 819 req.addAllReleases(releasedContainers); 820 req.setProgress((float)numCompletedContainers.get()/numTotalContainers); 821 822 LOG.info("Sending request to RM for containers" 823 + ", requestedSet=" + requestedContainers.size() 824 + ", releasedSet=" + releasedContainers.size() 825 + ", progress=" + req.getProgress()); 826 827 for (ResourceRequest rsrcReq : requestedContainers) { 828 LOG.info("Requested container ask: " + rsrcReq.toString()); 829 } 830 for (ContainerId id : releasedContainers) { 831 LOG.info("Released container, id=" + id.getId()); 832 } 833 834 AllocateResponse resp = resourceManager.allocate(req); 835 return resp.getAMResponse(); 836 } 837 }