View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.HashMap;
23  import java.util.Comparator;
24  import java.util.List;
25  
26  import org.apache.hadoop.conf.Configuration;
27  import org.apache.hadoop.hbase.ClusterManager.ServiceType;
28  import org.apache.hadoop.hbase.classification.InterfaceAudience;
29  import org.apache.hadoop.hbase.client.Admin;
30  import org.apache.hadoop.hbase.client.ClusterConnection;
31  import org.apache.hadoop.hbase.client.Connection;
32  import org.apache.hadoop.hbase.client.ConnectionFactory;
33  import org.apache.hadoop.hbase.client.RegionLocator;
34  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
35  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
36  import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ServerInfo;
37  import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
38  import org.apache.hadoop.hbase.protobuf.generated.MasterProtos.MasterService;
39  import org.apache.hadoop.hbase.util.Bytes;
40  import org.apache.hadoop.hbase.util.Threads;
41  import com.google.common.collect.Sets;
42  
43  /**
44   * Manages the interactions with an already deployed distributed cluster (as opposed to
45   * a pseudo-distributed, or mini/local cluster). This is used by integration and system tests.
46   */
47  @InterfaceAudience.Private
48  public class DistributedHBaseCluster extends HBaseCluster {
49    private Admin admin;
50    private final Connection connection;
51  
52    private ClusterManager clusterManager;
53  
54    public DistributedHBaseCluster(Configuration conf, ClusterManager clusterManager)
55        throws IOException {
56      super(conf);
57      this.clusterManager = clusterManager;
58      this.connection = ConnectionFactory.createConnection(conf);
59      this.admin = this.connection.getAdmin();
60      this.initialClusterStatus = getClusterStatus();
61    }
62  
63    public void setClusterManager(ClusterManager clusterManager) {
64      this.clusterManager = clusterManager;
65    }
66  
67    public ClusterManager getClusterManager() {
68      return clusterManager;
69    }
70  
71    /**
72     * Returns a ClusterStatus for this HBase cluster
73     * @throws IOException
74     */
75    @Override
76    public ClusterStatus getClusterStatus() throws IOException {
77      return admin.getClusterStatus();
78    }
79  
80    @Override
81    public ClusterStatus getInitialClusterStatus() throws IOException {
82      return initialClusterStatus;
83    }
84  
85    @Override
86    public void close() throws IOException {
87      if (this.admin != null) {
88        admin.close();
89      }
90      if (this.connection != null && !this.connection.isClosed()) {
91        this.connection.close();
92      }
93    }
94  
95    @Override
96    public AdminProtos.AdminService.BlockingInterface getAdminProtocol(ServerName serverName)
97    throws IOException {
98      return ((ClusterConnection)this.connection).getAdmin(serverName);
99    }
100 
101   @Override
102   public ClientProtos.ClientService.BlockingInterface getClientProtocol(ServerName serverName)
103   throws IOException {
104     return ((ClusterConnection)this.connection).getClient(serverName);
105   }
106 
107   @Override
108   public void startRegionServer(String hostname) throws IOException {
109     LOG.info("Starting RS on: " + hostname);
110     clusterManager.start(ServiceType.HBASE_REGIONSERVER, hostname);
111   }
112 
113   @Override
114   public void killRegionServer(ServerName serverName) throws IOException {
115     LOG.info("Aborting RS: " + serverName.getServerName());
116     clusterManager.kill(ServiceType.HBASE_REGIONSERVER, serverName.getHostname());
117   }
118 
119   @Override
120   public void stopRegionServer(ServerName serverName) throws IOException {
121     LOG.info("Stopping RS: " + serverName.getServerName());
122     clusterManager.stop(ServiceType.HBASE_REGIONSERVER, serverName.getHostname());
123   }
124 
125   @Override
126   public void waitForRegionServerToStop(ServerName serverName, long timeout) throws IOException {
127     waitForServiceToStop(ServiceType.HBASE_REGIONSERVER, serverName, timeout);
128   }
129 
130   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
131     throws IOException {
132     LOG.info("Waiting service:" + service + " to stop: " + serverName.getServerName());
133     long start = System.currentTimeMillis();
134 
135     while ((System.currentTimeMillis() - start) < timeout) {
136       if (!clusterManager.isRunning(service, serverName.getHostname())) {
137         return;
138       }
139       Threads.sleep(1000);
140     }
141     throw new IOException("did timeout waiting for service to stop:" + serverName);
142   }
143 
144   @Override
145   public MasterService.BlockingInterface getMasterAdminService()
146   throws IOException {
147     return ((ClusterConnection)this.connection).getMaster();
148   }
149 
150   @Override
151   public void startMaster(String hostname) throws IOException {
152     LOG.info("Starting Master on: " + hostname);
153     clusterManager.start(ServiceType.HBASE_MASTER, hostname);
154   }
155 
156   @Override
157   public void killMaster(ServerName serverName) throws IOException {
158     LOG.info("Aborting Master: " + serverName.getServerName());
159     clusterManager.kill(ServiceType.HBASE_MASTER, serverName.getHostname());
160   }
161 
162   @Override
163   public void stopMaster(ServerName serverName) throws IOException {
164     LOG.info("Stopping Master: " + serverName.getServerName());
165     clusterManager.stop(ServiceType.HBASE_MASTER, serverName.getHostname());
166   }
167 
168   @Override
169   public void waitForMasterToStop(ServerName serverName, long timeout) throws IOException {
170     waitForServiceToStop(ServiceType.HBASE_MASTER, serverName, timeout);
171   }
172 
173   @Override
174   public boolean waitForActiveAndReadyMaster(long timeout) throws IOException {
175     long start = System.currentTimeMillis();
176     while (System.currentTimeMillis() - start < timeout) {
177       try {
178         getMasterAdminService();
179         return true;
180       } catch (MasterNotRunningException m) {
181         LOG.warn("Master not started yet " + m);
182       } catch (ZooKeeperConnectionException e) {
183         LOG.warn("Failed to connect to ZK " + e);
184       }
185       Threads.sleep(1000);
186     }
187     return false;
188   }
189 
190   @Override
191   public ServerName getServerHoldingRegion(TableName tn, byte[] regionName) throws IOException {
192     HRegionLocation regionLoc = null;
193     try (RegionLocator locator = connection.getRegionLocator(tn)) {
194       regionLoc = locator.getRegionLocation(regionName);
195     }
196     if (regionLoc == null) {
197       LOG.warn("Cannot find region server holding region " + Bytes.toString(regionName) +
198         ", start key [" + Bytes.toString(HRegionInfo.getStartKey(regionName)) + "]");
199       return null;
200     }
201 
202     AdminProtos.AdminService.BlockingInterface client =
203         ((ClusterConnection)this.connection).getAdmin(regionLoc.getServerName());
204     ServerInfo info = ProtobufUtil.getServerInfo(client);
205     return ProtobufUtil.toServerName(info.getServerName());
206   }
207 
208   @Override
209   public void waitUntilShutDown() {
210     //Simply wait for a few seconds for now (after issuing serverManager.kill
211     throw new RuntimeException("Not implemented yet");
212   }
213 
214   @Override
215   public void shutdown() throws IOException {
216     //not sure we want this
217     throw new RuntimeException("Not implemented yet");
218   }
219 
220   @Override
221   public boolean isDistributedCluster() {
222     return true;
223   }
224 
225   @Override
226   public boolean restoreClusterStatus(ClusterStatus initial) throws IOException {
227     ClusterStatus current = getClusterStatus();
228 
229     LOG.info("Restoring cluster - started");
230 
231     // do a best effort restore
232     boolean success = true;
233     success = restoreMasters(initial, current) & success;
234     success = restoreRegionServers(initial, current) & success;
235     success = restoreAdmin() & success;
236 
237     LOG.info("Restoring cluster - done");
238     return success;
239   }
240 
241   protected boolean restoreMasters(ClusterStatus initial, ClusterStatus current) {
242     List<IOException> deferred = new ArrayList<IOException>();
243     //check whether current master has changed
244     if (!ServerName.isSameHostnameAndPort(initial.getMaster(), current.getMaster())) {
245       LOG.info("Restoring cluster - Initial active master : " + initial.getMaster().getHostname()
246           + " has changed to : " + current.getMaster().getHostname());
247       // If initial master is stopped, start it, before restoring the state.
248       // It will come up as a backup master, if there is already an active master.
249       try {
250         if (!clusterManager.isRunning(ServiceType.HBASE_MASTER, initial.getMaster().getHostname())) {
251           LOG.info("Restoring cluster - starting initial active master at:" + initial.getMaster().getHostname());
252           startMaster(initial.getMaster().getHostname());
253         }
254 
255         //master has changed, we would like to undo this.
256         //1. Kill the current backups
257         //2. Stop current master
258         //3. Start backup masters
259         for (ServerName currentBackup : current.getBackupMasters()) {
260           if (!ServerName.isSameHostnameAndPort(currentBackup, initial.getMaster())) {
261             LOG.info("Restoring cluster - stopping backup master: " + currentBackup);
262             stopMaster(currentBackup);
263           }
264         }
265         LOG.info("Restoring cluster - stopping active master: " + current.getMaster());
266         stopMaster(current.getMaster());
267         waitForActiveAndReadyMaster(); //wait so that active master takes over
268       } catch (IOException ex) {
269         // if we fail to start the initial active master, we do not want to continue stopping
270         // backup masters. Just keep what we have now
271         deferred.add(ex);
272       }
273 
274       //start backup masters
275       for (ServerName backup : initial.getBackupMasters()) {
276         try {
277           //these are not started in backup mode, but we should already have an active master
278           if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, backup.getHostname())) {
279             LOG.info("Restoring cluster - starting initial backup master: " + backup.getHostname());
280             startMaster(backup.getHostname());
281           }
282         } catch (IOException ex) {
283           deferred.add(ex);
284         }
285       }
286     } else {
287       //current master has not changed, match up backup masters
288       HashMap<String, ServerName> initialBackups = new HashMap<String, ServerName>();
289       HashMap<String, ServerName> currentBackups = new HashMap<String, ServerName>();
290 
291       for (ServerName server : initial.getBackupMasters()) {
292         initialBackups.put(server.getHostname(), server);
293       }
294       for (ServerName server : current.getBackupMasters()) {
295         currentBackups.put(server.getHostname(), server);
296       }
297 
298       for (String hostname : Sets.difference(initialBackups.keySet(), currentBackups.keySet())) {
299         try {
300           if(!clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
301             LOG.info("Restoring cluster - starting initial backup master: " + hostname);
302             startMaster(hostname);
303           }
304         } catch (IOException ex) {
305           deferred.add(ex);
306         }
307       }
308 
309       for (String hostname : Sets.difference(currentBackups.keySet(), initialBackups.keySet())) {
310         try {
311           if(clusterManager.isRunning(ServiceType.HBASE_MASTER, hostname)) {
312             LOG.info("Restoring cluster - stopping backup master: " + hostname);
313             stopMaster(currentBackups.get(hostname));
314           }
315         } catch (IOException ex) {
316           deferred.add(ex);
317         }
318       }
319     }
320     if (!deferred.isEmpty()) {
321       LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:");
322       for (int i=0; i<deferred.size() && i < 3; i++) {
323         LOG.warn(deferred.get(i));
324       }
325     }
326 
327     return deferred.isEmpty();
328   }
329 
330   protected boolean restoreRegionServers(ClusterStatus initial, ClusterStatus current) {
331     HashMap<String, ServerName> initialServers = new HashMap<String, ServerName>();
332     HashMap<String, ServerName> currentServers = new HashMap<String, ServerName>();
333 
334     for (ServerName server : initial.getServers()) {
335       initialServers.put(server.getHostname(), server);
336     }
337     for (ServerName server : current.getServers()) {
338       currentServers.put(server.getHostname(), server);
339     }
340 
341     List<IOException> deferred = new ArrayList<IOException>();
342     for (String hostname : Sets.difference(initialServers.keySet(), currentServers.keySet())) {
343       try {
344         if(!clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
345           LOG.info("Restoring cluster - starting initial region server: " + hostname);
346           startRegionServer(hostname);
347         }
348       } catch (IOException ex) {
349         deferred.add(ex);
350       }
351     }
352 
353     for (String hostname : Sets.difference(currentServers.keySet(), initialServers.keySet())) {
354       try {
355         if(clusterManager.isRunning(ServiceType.HBASE_REGIONSERVER, hostname)) {
356           LOG.info("Restoring cluster - stopping initial region server: " + hostname);
357           stopRegionServer(currentServers.get(hostname));
358         }
359       } catch (IOException ex) {
360         deferred.add(ex);
361       }
362     }
363     if (!deferred.isEmpty()) {
364       LOG.warn("Restoring cluster - restoring region servers reported " + deferred.size() + " errors:");
365       for (int i=0; i<deferred.size() && i < 3; i++) {
366         LOG.warn(deferred.get(i));
367       }
368     }
369 
370     return deferred.isEmpty();
371   }
372 
373   protected boolean restoreAdmin() throws IOException {
374     // While restoring above, if the HBase Master which was initially the Active one, was down
375     // and the restore put the cluster back to Initial configuration, HAdmin instance will need
376     // to refresh its connections (otherwise it will return incorrect information) or we can
377     // point it to new instance.
378     try {
379       admin.close();
380     } catch (IOException ioe) {
381       LOG.warn("While closing the old connection", ioe);
382     }
383     this.admin = this.connection.getAdmin();
384     LOG.info("Added new HBaseAdmin");
385     return true;
386   }
387 }