1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase;
19
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Comparator;
23 import java.util.HashMap;
24 import java.util.HashSet;
25 import java.util.List;
26 import java.util.Set;
27 import java.util.TreeSet;
28
29 import org.apache.hadoop.hbase.classification.InterfaceAudience;
30 import org.apache.hadoop.conf.Configuration;
31 import org.apache.hadoop.hbase.ClusterManager.ServiceType;
32 import org.apache.hadoop.hbase.client.HBaseAdmin;
33 import org.apache.hadoop.hbase.client.HConnection;
34 import org.apache.hadoop.hbase.client.HConnectionManager;
35 import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37 import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38 import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39 import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40 import org.apache.hadoop.hbase.util.Bytes;
41 import org.apache.hadoop.hbase.util.Threads;
42
43 import com.google.common.collect.Sets;
44
45
46
47
48
49 @InterfaceAudience.Private
50 public class DistributedHBaseCluster extends HBaseCluster {
51
52 private HBaseAdmin admin;
53
54 private ClusterManager clusterManager;
55
56 public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
57 throws IOException {
58 super(conf);
59 this.clusterManager = clusterManager;
60 this.admin = new HBaseAdmin(conf);
61 this.initialClusterStatus = getClusterStatus();
62 }
63
64 public void setClusterManager(ClusterManager clusterManager) {
65 this.clusterManager = clusterManager;
66 }
67
68 public ClusterManager getClusterManager() {
69 return clusterManager;
70 }
71
72
73
74
75
76 @Override
77 public ClusterStatus getClusterStatus() throws IOException {
78 return admin.getClusterStatus();
79 }
80
81 @Override
82 public ClusterStatus getInitialClusterStatus() throws IOException {
83 return initialClusterStatus;
84 }
85
86 @Override
87 public void close() throws IOException {
88 if (this.admin != null) {
89 admin.close();
90 }
91 }
92
93 @Override
94 public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
95 throws IOException {
96 return admin.getConnection().getAdmin(serverName);
97 }
98
99 @Override
100 public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
101 throws IOException {
102 return admin.getConnection().getClient(serverName);
103 }
104
105 @Override
106 public void startRegionServer(String hostname, int port) throws IOException {
107 LOG.info("Starting RS on: " + hostname);
108 clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
109 }
110
111 @Override
112 public void killRegionServer(ServerName serverName) throws IOException {
113 LOG.info("Aborting RS: " + serverName.getServerName());
114 clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
115 serverName.getHostname(), serverName.getPort());
116 }
117
118 @Override
119 public void stopRegionServer(ServerName serverName) throws IOException {
120 LOG.info("Stopping RS: " + serverName.getServerName());
121 clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
122 serverName.getHostname(), serverName.getPort());
123 }
124
125 @Override
126 public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
127 waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
128 }
129
130 @Override
131 public void startZkNode(String hostname, int port) throws IOException {
132 LOG.info("Starting Zookeeper node on: " + hostname);
133 clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
134 }
135
136 @Override
137 public void killZkNode(ServerName serverName) throws IOException {
138 LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
139 clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
140 serverName.getHostname(), serverName.getPort());
141 }
142
143 @Override
144 public void stopZkNode(ServerName serverName) throws IOException {
145 LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
146 clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
147 serverName.getHostname(), serverName.getPort());
148 }
149
150 @Override
151 public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
152 waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
153 }
154
155 @Override
156 public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
157 waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
158 }
159
160 @Override
161 public void startDataNode(ServerName serverName) throws IOException {
162 LOG.info("Starting data node on: " + serverName.getServerName());
163 clusterManager.start(ServiceType.HADOOP_DATANODE,
164 serverName.getHostname(), serverName.getPort());
165 }
166
167 @Override
168 public void killDataNode(ServerName serverName) throws IOException {
169 LOG.info("Aborting data node on: " + serverName.getServerName());
170 clusterManager.kill(ServiceType.HADOOP_DATANODE,
171 serverName.getHostname(), serverName.getPort());
172 }
173
174 @Override
175 public void stopDataNode(ServerName serverName) throws IOException {
176 LOG.info("Stopping data node on: " + serverName.getServerName());
177 clusterManager.stop(ServiceType.HADOOP_DATANODE,
178 serverName.getHostname(), serverName.getPort());
179 }
180
181 @Override
182 public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
183 waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
184 }
185
186 @Override
187 public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
188 waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
189 }
190
191 private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
192 throws IOException {
193 LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
194 long start = System.currentTimeMillis();
195
196 while ((System.currentTimeMillis() - start) < timeout) {
197 if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
198 return;
199 }
200 Threads.sleep(100);
201 }
202 throw new IOException("did timeout waiting for service to stop:" + serverName);
203 }
204
205 private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
206 throws IOException {
207 LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
208 long start = System.currentTimeMillis();
209
210 while ((System.currentTimeMillis() - start) < timeout) {
211 if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
212 return;
213 }
214 Threads.sleep(100);
215 }
216 throw new IOException("did timeout waiting for service to start:" + serverName);
217 }
218
219
220 @Override
221 public MasterService.BlockingInterface getMaster()
222 throws IOException {
223 HConnection conn = HConnectionManager.getConnection(conf);
224 return conn.getMaster();
225 }
226
227 @Override
228 public void startMaster(String hostname, int port) throws IOException {
229 LOG.info("Starting Master on: " + hostname + ":" + port);
230 clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
231 }
232
233 @Override
234 public void killMaster(ServerName serverName) throws IOException {
235 LOG.info("Aborting Master: " + serverName.getServerName());
236 clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
237 }
238
239 @Override
240 public void stopMaster(ServerName serverName) throws IOException {
241 LOG.info("Stopping Master: " + serverName.getServerName());
242 clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
243 }
244
245 @Override
246 public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
247 waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
248 }
249
250 @Override
251 public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
252 long start = System.currentTimeMillis();
253 while (System.currentTimeMillis() - start < timeout) {
254 try {
255 getMaster();
256 return true;
257 } catch (MasterNotRunningException m) {
258 LOG.warn("Master not started yet " + m);
259 } catch (ZooKeeperConnectionException e) {
260 LOG.warn("Failed to connect to ZK " + e);
261 }
262 Threads.sleep(1000);
263 }
264 return false;
265 }
266
267 @Override
268 public ServerName getServerHoldingRegion(byte[] regionName) throws IOException {
269 HConnection connection = admin.getConnection();
270 HRegionLocation regionLoc = connection.locateRegion(regionName);
271 if (regionLoc == null) {
272 LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName)
273 + " for table " + HRegionInfo.getTableName(regionName) + ", start key [" +
274 Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
275 return null;
276 }
277
278 AdminProtos.AdminService.BlockingInterface client =
279 connection.getAdmin(regionLoc.getServerName());
280 ServerInfo info = ProtobufUtil.getServerInfo(client);
281 return ProtobufUtil.toServerName(info.getServerName());
282 }
283
284 @Override
285 public void waitUntilShutDown() {
286
287 throw new RuntimeException("Not implemented yet");
288 }
289
290 @Override
291 public void shutdown() throws IOException {
292
293 throw new RuntimeException("Not implemented yet");
294 }
295
296 @Override
297 public boolean isDistributedCluster() {
298 return true;
299 }
300
301 @Override
302 public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
303 ClusterStatus current = getClusterStatus();
304
305 LOG.info("Restoring cluster - started");
306
307
308 boolean success = true;
309 success = restoreMasters(initial, current) & success;
310 success = restoreRegionServers(initial, current) & success;
311 success = restoreAdmin() & success;
312
313 LOG.info("Restoring cluster - done");
314 return success;
315 }
316
317 protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
318 List<IOException> deferred = new ArrayList<IOException>();
319
320 final ServerName initMaster = initial.getMaster();
321 if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
322 LOG.info("Restoring cluster - Initial active master : "
323 + initMaster.getHostAndPort()
324 + " has changed to : "
325 + current.getMaster().getHostAndPort());
326
327
328 try {
329 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
330 initMaster.getHostname(), initMaster.getPort())) {
331 LOG.info("Restoring cluster - starting initial active master at:"
332 + initMaster.getHostAndPort());
333 startMaster(initMaster.getHostname(), initMaster.getPort());
334 }
335
336
337
338
339
340 for (ServerName currentBackup : current.getBackupMasters()) {
341 if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
342 LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
343 stopMaster(currentBackup);
344 }
345 }
346 LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
347 stopMaster(current.getMaster());
348 waitForActiveAndReadyMaster();
349 } catch (IOException ex) {
350
351
352 deferred.add(ex);
353 }
354
355
356 for (ServerName backup : initial.getBackupMasters()) {
357 try {
358
359 if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
360 backup.getHostname(),
361 backup.getPort())) {
362 LOG.info("Restoring cluster - starting initial backup master: "
363 + backup.getHostAndPort());
364 startMaster(backup.getHostname(), backup.getPort());
365 }
366 } catch (IOException ex) {
367 deferred.add(ex);
368 }
369 }
370 } else {
371
372 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
373 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
374 toStart.addAll(initial.getBackupMasters());
375 toKill.addAll(current.getBackupMasters());
376
377 for (ServerName server : current.getBackupMasters()) {
378 toStart.remove(server);
379 }
380 for (ServerName server: initial.getBackupMasters()) {
381 toKill.remove(server);
382 }
383
384 for (ServerName sn:toStart) {
385 try {
386 if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
387 LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
388 startMaster(sn.getHostname(), sn.getPort());
389 }
390 } catch (IOException ex) {
391 deferred.add(ex);
392 }
393 }
394
395 for (ServerName sn:toKill) {
396 try {
397 if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
398 LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
399 stopMaster(sn);
400 }
401 } catch (IOException ex) {
402 deferred.add(ex);
403 }
404 }
405 }
406 if (!deferred.isEmpty()) {
407 LOG.warn("Restoring cluster - restoring region servers reported "
408 + deferred.size() + " errors:");
409 for (int i=0; i<deferred.size() && i < 3; i++) {
410 LOG.warn(deferred.get(i));
411 }
412 }
413
414 return deferred.isEmpty();
415 }
416
417
418 private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
419 @Override
420 public int compare(ServerName o1, ServerName o2) {
421 int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
422 if (compare != 0) return compare;
423 compare = o1.getPort() - o2.getPort();
424 if (compare != 0) return compare;
425 return 0;
426 }
427 }
428
429 protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
430 Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
431 Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
432 toStart.addAll(initial.getBackupMasters());
433 toKill.addAll(current.getBackupMasters());
434
435 for (ServerName server : current.getServers()) {
436 toStart.remove(server);
437 }
438 for (ServerName server: initial.getServers()) {
439 toKill.remove(server);
440 }
441
442 List<IOException> deferred = new ArrayList<IOException>();
443
444 for(ServerName sn:toStart) {
445 try {
446 if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
447 sn.getHostname(),
448 sn.getPort())) {
449 LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
450 startRegionServer(sn.getHostname(), sn.getPort());
451 }
452 } catch (IOException ex) {
453 deferred.add(ex);
454 }
455 }
456
457 for(ServerName sn:toKill) {
458 try {
459 if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
460 sn.getHostname(),
461 sn.getPort())) {
462 LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
463 stopRegionServer(sn);
464 }
465 } catch (IOException ex) {
466 deferred.add(ex);
467 }
468 }
469 if (!deferred.isEmpty()) {
470 LOG.warn("Restoring cluster - restoring region servers reported "
471 + deferred.size() + " errors:");
472 for (int i=0; i<deferred.size() && i < 3; i++) {
473 LOG.warn(deferred.get(i));
474 }
475 }
476
477 return deferred.isEmpty();
478 }
479
480 protected boolean restoreAdmin() throws IOException {
481
482
483
484
485 try {
486 admin.close();
487 } catch (IOException ioe) {
488 LOG.warn("While closing the old connection", ioe);
489 }
490 this.admin = new HBaseAdmin(conf);
491 LOG.info("Added new HBaseAdmin");
492 return true;
493 }
494 }