View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Comparator;
23  import java.util.HashMap;
24  import java.util.HashSet;
25  import java.util.List;
26  import java.util.Set;
27  import java.util.TreeSet;
28  
29  import org.apache.hadoop.hbase.classification.InterfaceAudience;
30  import org.apache.hadoop.conf.Configuration;
31  import org.apache.hadoop.hbase.ClusterManager.ServiceType;
32  import org.apache.hadoop.hbase.client.HBaseAdmin;
33  import org.apache.hadoop.hbase.client.HConnection;
34  import org.apache.hadoop.hbase.client.HConnectionManager;
35  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
36  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
37  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
38  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
39  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
40  import org.apache.hadoop.hbase.util.Bytes;
41  import org.apache.hadoop.hbase.util.Threads;
42  
43  import com.google.common.collect.Sets;
44  
45  /**
46   * Manages the interactions with an already deployed distributed cluster (as opposed to
47   * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
48   */
49  @InterfaceAudience.Private
50  public class DistributedHBaseCluster extends HBaseCluster {
51  
52    private HBaseAdmin admin;
53  
54    private ClusterManager clusterManager;
55  
56    public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
57        throws IOException {
58      super(conf);
59      this.clusterManager = clusterManager;
60      this.admin = new HBaseAdmin(conf);
61      this.initialClusterStatus = getClusterStatus();
62    }
63  
64    public void setClusterManager(ClusterManager clusterManager) {
65      this.clusterManager = clusterManager;
66    }
67  
68    public ClusterManager getClusterManager() {
69      return clusterManager;
70    }
71  
72    /**
73     * Returns a ClusterStatus for this HBase cluster
74     * @throws IOException
75     */
76    @Override
77    public ClusterStatus getClusterStatus() throws IOException {
78      return admin.getClusterStatus();
79    }
80  
81    @Override
82    public ClusterStatus getInitialClusterStatus() throws IOException {
83      return initialClusterStatus;
84    }
85  
86    @Override
87    public void close() throws IOException {
88      if (this.admin != null) {
89        admin.close();
90      }
91    }
92  
93    @Override
94    public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
95    throws IOException {
96      return admin.getConnection().getAdmin(serverName);
97    }
98  
99    @Override
100   public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
101   throws IOException {
102     return admin.getConnection().getClient(serverName);
103   }
104 
105   @Override
106   public void startRegionServer(String hostname, int port) throws IOException {
107     LOG.info("Starting RS on: " + hostname);
108     clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname, port);
109   }
110 
111   @Override
112   public void killRegionServer(ServerName serverName) throws IOException {
113     LOG.info("Aborting RS: " + serverName.getServerName());
114     clusterManager.kill(ServiceType.HBASE_REGIONSERVER,
115       serverName.getHostname(), serverName.getPort());
116   }
117 
118   @Override
119   public void stopRegionServer(ServerName serverName) throws IOException {
120     LOG.info("Stopping RS: " + serverName.getServerName());
121     clusterManager.stop(ServiceType.HBASE_REGIONSERVER,
122       serverName.getHostname(), serverName.getPort());
123   }
124 
125   @Override
126   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
127     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
128   }
129 
130   @Override
131   public void startZkNode(String hostname, int port) throws IOException {
132     LOG.info("Starting Zookeeper node on: " + hostname);
133     clusterManager.start(ServiceType.ZOOKEEPER_SERVER, hostname, port);
134   }
135 
136   @Override
137   public void killZkNode(ServerName serverName) throws IOException {
138     LOG.info("Aborting Zookeeper node on: " + serverName.getServerName());
139     clusterManager.kill(ServiceType.ZOOKEEPER_SERVER,
140       serverName.getHostname(), serverName.getPort());
141   }
142 
143   @Override
144   public void stopZkNode(ServerName serverName) throws IOException {
145     LOG.info("Stopping Zookeeper node: " + serverName.getServerName());
146     clusterManager.stop(ServiceType.ZOOKEEPER_SERVER,
147       serverName.getHostname(), serverName.getPort());
148   }
149 
150   @Override
151   public void waitForZkNodeToStart(ServerName serverName, long timeout) throws IOException {
152     waitForServiceToStart(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
153   }
154 
155   @Override
156   public void waitForZkNodeToStop(ServerName serverName, long timeout) throws IOException {
157     waitForServiceToStop(ServiceType.ZOOKEEPER_SERVER, serverName, timeout);
158   }
159 
160   @Override
161   public void startDataNode(ServerName serverName) throws IOException {
162     LOG.info("Starting data node on: " + serverName.getServerName());
163     clusterManager.start(ServiceType.HADOOP_DATANODE,
164       serverName.getHostname(), serverName.getPort());
165   }
166 
167   @Override
168   public void killDataNode(ServerName serverName) throws IOException {
169     LOG.info("Aborting data node on: " + serverName.getServerName());
170     clusterManager.kill(ServiceType.HADOOP_DATANODE,
171       serverName.getHostname(), serverName.getPort());
172   }
173 
174   @Override
175   public void stopDataNode(ServerName serverName) throws IOException {
176     LOG.info("Stopping data node on: " + serverName.getServerName());
177     clusterManager.stop(ServiceType.HADOOP_DATANODE,
178       serverName.getHostname(), serverName.getPort());
179   }
180 
181   @Override
182   public void waitForDataNodeToStart(ServerName serverName, long timeout) throws IOException {
183     waitForServiceToStart(ServiceType.HADOOP_DATANODE, serverName, timeout);
184   }
185 
186   @Override
187   public void waitForDataNodeToStop(ServerName serverName, long timeout) throws IOException {
188     waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
189   }
190 
191   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
192     throws IOException {
193     LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
194     long start = System.currentTimeMillis();
195 
196     while ((System.currentTimeMillis() - start) < timeout) {
197       if (!clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
198         return;
199       }
200       Threads.sleep(100);
201     }
202     throw new IOException("did timeout waiting for service to stop:" + serverName);
203   }
204 
205   private void waitForServiceToStart(ServiceType service, ServerName serverName, long timeout)
206     throws IOException {
207     LOG.info("Waiting for service: " + service + " to start: " + serverName.getServerName());
208     long start = System.currentTimeMillis();
209 
210     while ((System.currentTimeMillis() - start) < timeout) {
211       if (clusterManager.isRunning(service, serverName.getHostname(), serverName.getPort())) {
212         return;
213       }
214       Threads.sleep(100);
215     }
216     throw new IOException("did timeout waiting for service to start:" + serverName);
217   }
218 
219 
220   @Override
221   public MasterService.BlockingInterface getMaster()
222   throws IOException {
223     HConnection conn = HConnectionManager.getConnection(conf);
224     return conn.getMaster();
225   }
226 
227   @Override
228   public void startMaster(String hostname, int port) throws IOException {
229     LOG.info("Starting Master on: " + hostname + ":" + port);
230     clusterManager.start(ServiceType.HBASE_MASTER, hostname, port);
231   }
232 
233   @Override
234   public void killMaster(ServerName serverName) throws IOException {
235     LOG.info("Aborting Master: " + serverName.getServerName());
236     clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
237   }
238 
239   @Override
240   public void stopMaster(ServerName serverName) throws IOException {
241     LOG.info("Stopping Master: " + serverName.getServerName());
242     clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname(), serverName.getPort());
243   }
244 
245   @Override
246   public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
247     waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
248   }
249 
250   @Override
251   public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
252     long start = System.currentTimeMillis();
253     while (System.currentTimeMillis() - start < timeout) {
254       try {
255         getMaster();
256         return true;
257       } catch (MasterNotRunningException m) {
258         LOG.warn("Master not started yet " + m);
259       } catch (ZooKeeperConnectionException e) {
260         LOG.warn("Failed to connect to ZK " + e);
261       }
262       Threads.sleep(1000);
263     }
264     return false;
265   }
266 
267   @Override
268   public ServerName getServerHoldingRegion(byte[] regionName) throws IOException {
269     HConnection connection = admin.getConnection();
270     HRegionLocation regionLoc = connection.locateRegion(regionName);
271     if (regionLoc == null) {
272       LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName)
273           + " for table " + HRegionInfo.getTableName(regionName) + ", start key [" +
274           Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
275       return null;
276     }
277 
278     AdminProtos.AdminService.BlockingInterface client =
279       connection.getAdmin(regionLoc.getServerName());
280     ServerInfo info = ProtobufUtil.getServerInfo(client);
281     return ProtobufUtil.toServerName(info.getServerName());
282   }
283 
284   @Override
285   public void waitUntilShutDown() {
286     // Simply wait for a few seconds for now (after issuing serverManager.kill
287     throw new RuntimeException("Not implemented yet");
288   }
289 
290   @Override
291   public void shutdown() throws IOException {
292     // not sure we want this
293     throw new RuntimeException("Not implemented yet");
294   }
295 
296   @Override
297   public boolean isDistributedCluster() {
298     return true;
299   }
300 
301   @Override
302   public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
303     ClusterStatus current = getClusterStatus();
304 
305     LOG.info("Restoring cluster - started");
306 
307     // do a best effort restore
308     boolean success = true;
309     success = restoreMasters(initial, current) & success;
310     success = restoreRegionServers(initial, current) & success;
311     success = restoreAdmin() & success;
312 
313     LOG.info("Restoring cluster - done");
314     return success;
315   }
316 
317   protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
318     List<IOException> deferred = new ArrayList<IOException>();
319     //check whether current master has changed
320     final ServerName initMaster = initial.getMaster();
321     if (!ServerName.isSameHostnameAndPort(initMaster, current.getMaster())) {
322       LOG.info("Restoring cluster - Initial active master : "
323               + initMaster.getHostAndPort()
324               + " has changed to : "
325               + current.getMaster().getHostAndPort());
326       // If initial master is stopped, start it, before restoring the state.
327       // It will come up as a backup master, if there is already an active master.
328       try {
329         if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
330                 initMaster.getHostname(), initMaster.getPort())) {
331           LOG.info("Restoring cluster - starting initial active master at:"
332                   + initMaster.getHostAndPort());
333           startMaster(initMaster.getHostname(), initMaster.getPort());
334         }
335 
336         // master has changed, we would like to undo this.
337         // 1. Kill the current backups
338         // 2. Stop current master
339         // 3. Start backup masters
340         for (ServerName currentBackup : current.getBackupMasters()) {
341           if (!ServerName.isSameHostnameAndPort(currentBackup, initMaster)) {
342             LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
343             stopMaster(currentBackup);
344           }
345         }
346         LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
347         stopMaster(current.getMaster());
348         waitForActiveAndReadyMaster(); // wait so that active master takes over
349       } catch (IOException ex) {
350         // if we fail to start the initial active master, we do not want to continue stopping
351         // backup masters. Just keep what we have now
352         deferred.add(ex);
353       }
354 
355       //start backup masters
356       for (ServerName backup : initial.getBackupMasters()) {
357         try {
358           //these are not started in backup mode, but we should already have an active master
359           if (!clusterManager.isRunning(ServiceType.HBASE_MASTER,
360                   backup.getHostname(),
361                   backup.getPort())) {
362             LOG.info("Restoring cluster - starting initial backup master: "
363                     + backup.getHostAndPort());
364             startMaster(backup.getHostname(), backup.getPort());
365           }
366         } catch (IOException ex) {
367           deferred.add(ex);
368         }
369       }
370     } else {
371       //current master has not changed, match up backup masters
372       Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
373       Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
374       toStart.addAll(initial.getBackupMasters());
375       toKill.addAll(current.getBackupMasters());
376 
377       for (ServerName server : current.getBackupMasters()) {
378         toStart.remove(server);
379       }
380       for (ServerName server: initial.getBackupMasters()) {
381         toKill.remove(server);
382       }
383 
384       for (ServerName sn:toStart) {
385         try {
386           if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
387             LOG.info("Restoring cluster - starting initial backup master: " + sn.getHostAndPort());
388             startMaster(sn.getHostname(), sn.getPort());
389           }
390         } catch (IOException ex) {
391           deferred.add(ex);
392         }
393       }
394 
395       for (ServerName sn:toKill) {
396         try {
397           if(clusterManager.isRunning(ServiceType.HBASE_MASTER, sn.getHostname(), sn.getPort())) {
398             LOG.info("Restoring cluster - stopping backup master: " + sn.getHostAndPort());
399             stopMaster(sn);
400           }
401         } catch (IOException ex) {
402           deferred.add(ex);
403         }
404       }
405     }
406     if (!deferred.isEmpty()) {
407       LOG.warn("Restoring cluster - restoring region servers reported "
408               + deferred.size() + " errors:");
409       for (int i=0; i<deferred.size() && i < 3; i++) {
410         LOG.warn(deferred.get(i));
411       }
412     }
413 
414     return deferred.isEmpty();
415   }
416 
417 
418   private static class ServerNameIgnoreStartCodeComparator implements Comparator<ServerName> {
419     @Override
420     public int compare(ServerName o1, ServerName o2) {
421       int compare = o1.getHostname().compareToIgnoreCase(o2.getHostname());
422       if (compare != 0) return compare;
423       compare = o1.getPort() - o2.getPort();
424       if (compare != 0) return compare;
425       return 0;
426     }
427   }
428 
429   protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
430     Set<ServerName> toStart = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
431     Set<ServerName> toKill = new TreeSet<ServerName>(new ServerNameIgnoreStartCodeComparator());
432     toStart.addAll(initial.getBackupMasters());
433     toKill.addAll(current.getBackupMasters());
434 
435     for (ServerName server : current.getServers()) {
436       toStart.remove(server);
437     }
438     for (ServerName server: initial.getServers()) {
439       toKill.remove(server);
440     }
441 
442     List<IOException> deferred = new ArrayList<IOException>();
443 
444     for(ServerName sn:toStart) {
445       try {
446         if (!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
447                 sn.getHostname(),
448                 sn.getPort())) {
449           LOG.info("Restoring cluster - starting initial region server: " + sn.getHostAndPort());
450           startRegionServer(sn.getHostname(), sn.getPort());
451         }
452       } catch (IOException ex) {
453         deferred.add(ex);
454       }
455     }
456 
457     for(ServerName sn:toKill) {
458       try {
459         if (clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER,
460                 sn.getHostname(),
461                 sn.getPort())) {
462           LOG.info("Restoring cluster - stopping initial region server: " + sn.getHostAndPort());
463           stopRegionServer(sn);
464         }
465       } catch (IOException ex) {
466         deferred.add(ex);
467       }
468     }
469     if (!deferred.isEmpty()) {
470       LOG.warn("Restoring cluster - restoring region servers reported "
471               + deferred.size() + " errors:");
472       for (int i=0; i<deferred.size() && i < 3; i++) {
473         LOG.warn(deferred.get(i));
474       }
475     }
476 
477     return deferred.isEmpty();
478   }
479 
480   protected boolean restoreAdmin() throws IOException {
481     // While restoring above, if the HBase Master which was initially the Active one, was down
482     // and the restore put the cluster back to Initial configuration, HAdmin instance will need
483     // to refresh its connections (otherwise it will return incorrect information) or we can
484     // point it to new instance.
485     try {
486       admin.close();
487     } catch (IOException ioe) {
488       LOG.warn("While closing the old connection", ioe);
489     }
490     this.admin = new HBaseAdmin(conf);
491     LOG.info("Added new HBaseAdmin");
492     return true;
493   }
494 }