View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Collection;
23  import java.util.Comparator;
24  import java.util.Deque;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Map.Entry;
29  import java.util.Random;
30  import java.util.Set;
31  import java.util.TreeMap;
32  import java.util.NavigableMap;
33  
34  import org.apache.commons.logging.Log;
35  import org.apache.commons.logging.LogFactory;
36  import org.apache.hadoop.conf.Configuration;
37  import org.apache.hadoop.hbase.ClusterStatus;
38  import org.apache.hadoop.hbase.HBaseIOException;
39  import org.apache.hadoop.hbase.HRegionInfo;
40  import org.apache.hadoop.hbase.RegionLoad;
41  import org.apache.hadoop.hbase.ServerName;
42  import org.apache.hadoop.hbase.master.AssignmentManager;
43  import org.apache.hadoop.hbase.master.LoadBalancer;
44  import org.apache.hadoop.hbase.master.MasterServices;
45  
46  import com.google.common.base.Joiner;
47  import com.google.common.collect.ArrayListMultimap;
48  import com.google.common.collect.Sets;
49  
50  /**
 * The base class for load balancers. It provides the functions used by
 * {@link AssignmentManager} to assign regions in the edge cases. It doesn't
53   * provide an implementation of the actual balancing algorithm.
54   *
55   */
56  public abstract class BaseLoadBalancer implements LoadBalancer {
  // Minimum number of servers needsBalance() requires before it will consider balancing.
  private static final int MIN_SERVER_BALANCE = 2;
  // Set to true by stop(String) and reported by isStopped(); volatile so the write is
  // visible to other threads reading the flag.
  private volatile boolean stopped = false;
59  
60    /**
61     * An efficient array based implementation similar to ClusterState for keeping
62     * the status of the cluster in terms of region assignment and distribution.
63     * To be used by LoadBalancers.
64     */
65    protected static class Cluster {
    // serverIndex -> ServerName; when several names share a host:port the constructor
    // keeps the one with the newest start code
    ServerName[] servers;
    // tableIndex -> table name, in the order tables were first encountered
    ArrayList<String> tables;
    // regionIndex -> region
    HRegionInfo[] regions;
    // regionIndex -> load history looked up from the constructor's "loads" map (may be null)
    Deque<RegionLoad>[] regionLoads;
    int[][] regionLocations; //regionIndex -> list of serverIndex sorted by locality

    int[][] regionsPerServer;            //serverIndex -> region list
    int[]   regionIndexToServerIndex;    //regionIndex -> serverIndex
    int[]   initialRegionIndexToServerIndex;    //regionIndex -> serverIndex (initial cluster state)
    int[]   regionIndexToTableIndex;     //regionIndex -> tableIndex
    int[][] numRegionsPerServerPerTable; //serverIndex -> tableIndex -> # regions
    int[]   numMaxRegionsPerTable;       //tableIndex -> max number of regions in a single RS

    // Server indices; initialised to 0..numServers-1 and reordered by
    // sortServersByRegionCount()
    Integer[] serverIndicesSortedByRegionCount;

    // "host:port" (ServerName.getHostAndPort()) -> serverIndex
    Map<String, Integer> serversToIndex;
    // table name -> tableIndex
    Map<String, Integer> tablesToIndex;

    int numRegions;
    int numServers;
    int numTables;

    int numMovedRegions = 0; //num moved regions from the initial configuration
    int numMovedMetaRegions = 0;       //num of moved regions that are META
90  
91      protected Cluster(Map<ServerName, List<HRegionInfo>> clusterState,  Map<String, Deque<RegionLoad>> loads,
92          RegionLocationFinder regionFinder) {
93  
94        serversToIndex = new HashMap<String, Integer>();
95        tablesToIndex = new HashMap<String, Integer>();
96        //regionsToIndex = new HashMap<HRegionInfo, Integer>();
97  
98        //TODO: We should get the list of tables from master
99        tables = new ArrayList<String>();
100 
101 
102       numRegions = 0;
103 
104       int serverIndex = 0;
105 
106       // Use servername and port as there can be dead servers in this list. We want everything with
107       // a matching hostname and port to have the same index.
108       for (ServerName sn:clusterState.keySet()) {
109         if (serversToIndex.get(sn.getHostAndPort()) == null) {
110           serversToIndex.put(sn.getHostAndPort(), serverIndex++);
111         }
112       }
113 
114       // Count how many regions there are.
115       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
116         numRegions += entry.getValue().size();
117       }
118 
119       numServers = serversToIndex.size();
120       regionsPerServer = new int[serversToIndex.size()][];
121 
122       servers = new ServerName[numServers];
123       regions = new HRegionInfo[numRegions];
124       regionIndexToServerIndex = new int[numRegions];
125       initialRegionIndexToServerIndex = new int[numRegions];
126       regionIndexToTableIndex = new int[numRegions];
127       regionLoads = new Deque[numRegions];
128       regionLocations = new int[numRegions][];
129       serverIndicesSortedByRegionCount = new Integer[numServers];
130 
131       int tableIndex = 0, regionIndex = 0, regionPerServerIndex = 0;
132 
133       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
134         serverIndex = serversToIndex.get(entry.getKey().getHostAndPort());
135 
136         // keep the servername if this is the first server name for this hostname
137         // or this servername has the newest startcode.
138         if (servers[serverIndex] == null ||
139             servers[serverIndex].getStartcode() < entry.getKey().getStartcode()) {
140           servers[serverIndex] = entry.getKey();
141         }
142 
143         regionsPerServer[serverIndex] = new int[entry.getValue().size()];
144         serverIndicesSortedByRegionCount[serverIndex] = serverIndex;
145       }
146 
147       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
148         serverIndex = serversToIndex.get(entry.getKey().getHostAndPort());
149         regionPerServerIndex = 0;
150 
151         for (HRegionInfo region : entry.getValue()) {
152           String tableName = region.getTable().getNameAsString();
153           Integer idx = tablesToIndex.get(tableName);
154           if (idx == null) {
155             tables.add(tableName);
156             idx = tableIndex;
157             tablesToIndex.put(tableName, tableIndex++);
158           }
159 
160           regions[regionIndex] = region;
161           regionIndexToServerIndex[regionIndex] = serverIndex;
162           initialRegionIndexToServerIndex[regionIndex] = serverIndex;
163           regionIndexToTableIndex[regionIndex] = idx;
164           regionsPerServer[serverIndex][regionPerServerIndex++] = regionIndex;
165 
166           // region load
167           if (loads != null) {
168             Deque<RegionLoad> rl = loads.get(region.getRegionNameAsString());
169             // That could have failed if the RegionLoad is using the other regionName
170             if (rl == null) {
171               // Try getting the region load using encoded name.
172               rl = loads.get(region.getEncodedName());
173             }
174             regionLoads[regionIndex] = rl;
175           }
176 
177           if (regionFinder != null) {
178             //region location
179             List<ServerName> loc = regionFinder.getTopBlockLocations(region);
180             regionLocations[regionIndex] = new int[loc.size()];
181             for (int i=0; i < loc.size(); i++) {
182               regionLocations[regionIndex][i] =
183                   loc.get(i) == null ? -1 :
184                     (serversToIndex.get(loc.get(i)) == null ? -1 : serversToIndex.get(loc.get(i)));
185             }
186           }
187 
188           regionIndex++;
189         }
190       }
191 
192       numTables = tables.size();
193       numRegionsPerServerPerTable = new int[numServers][numTables];
194 
195       for (int i = 0; i < numServers; i++) {
196         for (int j = 0; j < numTables; j++) {
197           numRegionsPerServerPerTable[i][j] = 0;
198         }
199       }
200 
201       for (int i=0; i < regionIndexToServerIndex.length; i++) {
202         numRegionsPerServerPerTable[regionIndexToServerIndex[i]][regionIndexToTableIndex[i]]++;
203       }
204 
205       numMaxRegionsPerTable = new int[numTables];
206       for (serverIndex = 0 ; serverIndex < numRegionsPerServerPerTable.length; serverIndex++) {
207         for (tableIndex = 0 ; tableIndex < numRegionsPerServerPerTable[serverIndex].length; tableIndex++) {
208           if (numRegionsPerServerPerTable[serverIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
209             numMaxRegionsPerTable[tableIndex] = numRegionsPerServerPerTable[serverIndex][tableIndex];
210           }
211         }
212       }
213     }
214 
215     public void moveOrSwapRegion(int lServer, int rServer, int lRegion, int rRegion) {
216       //swap
217       if (rRegion >= 0 && lRegion >= 0) {
218         regionMoved(rRegion, rServer, lServer);
219         regionsPerServer[rServer] = replaceRegion(regionsPerServer[rServer], rRegion, lRegion);
220         regionMoved(lRegion, lServer, rServer);
221         regionsPerServer[lServer] = replaceRegion(regionsPerServer[lServer], lRegion, rRegion);
222       } else if (rRegion >= 0) { //move rRegion
223         regionMoved(rRegion, rServer, lServer);
224         regionsPerServer[rServer] = removeRegion(regionsPerServer[rServer], rRegion);
225         regionsPerServer[lServer] = addRegion(regionsPerServer[lServer], rRegion);
226       } else if (lRegion >= 0) { //move lRegion
227         regionMoved(lRegion, lServer, rServer);
228         regionsPerServer[lServer] = removeRegion(regionsPerServer[lServer], lRegion);
229         regionsPerServer[rServer] = addRegion(regionsPerServer[rServer], lRegion);
230       }
231     }
232 
233     /** Region moved out of the server */
234     void regionMoved(int regionIndex, int oldServerIndex, int newServerIndex) {
235       regionIndexToServerIndex[regionIndex] = newServerIndex;
236       if (initialRegionIndexToServerIndex[regionIndex] == newServerIndex) {
237         numMovedRegions--; //region moved back to original location
238         if (regions[regionIndex].isMetaRegion()) {
239           numMovedMetaRegions--;
240         }
241       } else if (initialRegionIndexToServerIndex[regionIndex] == oldServerIndex) {
242         numMovedRegions++; //region moved from original location
243         if (regions[regionIndex].isMetaRegion()) {
244           numMovedMetaRegions++;
245         }
246       }
247       int tableIndex = regionIndexToTableIndex[regionIndex];
248       numRegionsPerServerPerTable[oldServerIndex][tableIndex]--;
249       numRegionsPerServerPerTable[newServerIndex][tableIndex]++;
250 
251       //check whether this caused maxRegionsPerTable in the new Server to be updated
252       if (numRegionsPerServerPerTable[newServerIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
253         numRegionsPerServerPerTable[newServerIndex][tableIndex] = numMaxRegionsPerTable[tableIndex];
254       } else if ((numRegionsPerServerPerTable[oldServerIndex][tableIndex] + 1)
255           == numMaxRegionsPerTable[tableIndex]) {
256         //recompute maxRegionsPerTable since the previous value was coming from the old server
257         for (int serverIndex = 0 ; serverIndex < numRegionsPerServerPerTable.length; serverIndex++) {
258           if (numRegionsPerServerPerTable[serverIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
259             numMaxRegionsPerTable[tableIndex] = numRegionsPerServerPerTable[serverIndex][tableIndex];
260           }
261         }
262       }
263     }
264 
265     int[] removeRegion(int[] regions, int regionIndex) {
266       //TODO: this maybe costly. Consider using linked lists
267       int[] newRegions = new int[regions.length - 1];
268       int i = 0;
269       for (i = 0; i < regions.length; i++) {
270         if (regions[i] == regionIndex) {
271           break;
272         }
273         newRegions[i] = regions[i];
274       }
275       System.arraycopy(regions, i+1, newRegions, i, newRegions.length - i);
276       return newRegions;
277     }
278 
279     int[] addRegion(int[] regions, int regionIndex) {
280       int[] newRegions = new int[regions.length + 1];
281       System.arraycopy(regions, 0, newRegions, 0, regions.length);
282       newRegions[newRegions.length - 1] = regionIndex;
283       return newRegions;
284     }
285 
286     int[] replaceRegion(int[] regions, int regionIndex, int newRegionIndex) {
287       int i = 0;
288       for (i = 0; i < regions.length; i++) {
289         if (regions[i] == regionIndex) {
290           regions[i] = newRegionIndex;
291           break;
292         }
293       }
294       return regions;
295     }
296 
297     void sortServersByRegionCount() {
298       Arrays.sort(serverIndicesSortedByRegionCount, numRegionsComparator);
299     }
300 
301     int getNumRegions(int server) {
302       return regionsPerServer[server].length;
303     }
304 
305     private Comparator<Integer> numRegionsComparator = new Comparator<Integer>() {
306       @Override
307       public int compare(Integer integer, Integer integer2) {
308         return Integer.valueOf(getNumRegions(integer)).compareTo(getNumRegions(integer2));
309       }
310     };
311 
312     @Override
313     public String toString() {
314       String desc = "Cluster{" +
315           "servers=[";
316           for(ServerName sn:servers) {
317              desc += sn.getHostAndPort() + ", ";
318           }
319           desc +=
320           ", serverIndicesSortedByRegionCount="+
321           Arrays.toString(serverIndicesSortedByRegionCount) +
322           ", regionsPerServer=[";
323 
324           for (int[]r:regionsPerServer) {
325             desc += Arrays.toString(r);
326           }
327           desc += "]" +
328           ", numMaxRegionsPerTable=" +
329           Arrays.toString(numMaxRegionsPerTable) +
330           ", numRegions=" +
331           numRegions +
332           ", numServers=" +
333           numServers +
334           ", numTables=" +
335           numTables +
336           ", numMovedRegions=" +
337           numMovedRegions +
338           ", numMovedMetaRegions=" +
339           numMovedMetaRegions +
340           '}';
341       return desc;
342     }
343   }
344 
  // slop for regions: tolerated fractional deviation from the average load; read from
  // "hbase.regions.slop" by setSlop() and clamped to [0, 1] by setConf()
  protected float slop;
  // Configuration handed to setConf() and returned by getConf()
  private Configuration config;
  // Shared random source used by the assignment methods to pick servers
  private static final Random RANDOM = new Random(System.currentTimeMillis());
  private static final Log LOG = LogFactory.getLog(BaseLoadBalancer.class);

  // Each assignment method bumps the "misc invocations" counter on this
  protected final MetricsBalancer metricsBalancer = new MetricsBalancer();
  // Set through setMasterServices(); not read anywhere in this class
  protected MasterServices services;
353 
354   @Override
355   public void setConf(Configuration conf) {
356     setSlop(conf);
357     if (slop < 0) slop = 0;
358     else if (slop > 1) slop = 1;
359 
360     this.config = conf;
361   }
362 
363   protected void setSlop(Configuration conf) {
364     this.slop = conf.getFloat("hbase.regions.slop", (float) 0.2);
365   }
366 
367   @Override
368   public Configuration getConf() {
369     return this.config;
370   }
371 
372   public void setClusterStatus(ClusterStatus st) {
373     // Not used except for the StocasticBalancer
374   }
375 
376   public void setMasterServices(MasterServices masterServices) {
377     this.services = masterServices;
378   }
379 
380   protected boolean needsBalance(ClusterLoadState cs) {
381     if (cs.getNumServers() < MIN_SERVER_BALANCE) {
382       if (LOG.isDebugEnabled()) {
383         LOG.debug("Not running balancer because only " + cs.getNumServers()
384             + " active regionserver(s)");
385       }
386       return false;
387     }
388     // Check if we even need to do any load balancing
389     // HBASE-3681 check sloppiness first
390     float average = cs.getLoadAverage(); // for logging
391     int floor = (int) Math.floor(average * (1 - slop));
392     int ceiling = (int) Math.ceil(average * (1 + slop));
393     if (!(cs.getMinLoad() > ceiling || cs.getMaxLoad() < floor)) {
394       NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
395       if (LOG.isTraceEnabled()) {
396         // If nothing to balance, then don't say anything unless trace-level logging.
397         LOG.trace("Skipping load balancing because balanced cluster; " +
398           "servers=" + cs.getNumServers() + " " +
399           "regions=" + cs.getNumRegions() + " average=" + average + " " +
400           "mostloaded=" + serversByLoad.lastKey().getLoad() +
401           " leastloaded=" + serversByLoad.firstKey().getLoad());
402       }
403       return false;
404     }
405     return true;
406   }
407 
408   /**
409    * Generates a bulk assignment plan to be used on cluster startup using a
410    * simple round-robin assignment.
411    * <p>
412    * Takes a list of all the regions and all the servers in the cluster and
413    * returns a map of each server to the regions that it should be assigned.
414    * <p>
415    * Currently implemented as a round-robin assignment. Same invariant as load
416    * balancing, all servers holding floor(avg) or ceiling(avg).
417    *
418    * TODO: Use block locations from HDFS to place regions with their blocks
419    *
420    * @param regions all regions
421    * @param servers all servers
422    * @return map of server to the regions it should take, or null if no
423    *         assignment is possible (ie. no regions or no servers)
424    */
425   public Map<ServerName, List<HRegionInfo>> roundRobinAssignment(List<HRegionInfo> regions,
426       List<ServerName> servers) {
427     metricsBalancer.incrMiscInvocations();
428 
429     if (regions.isEmpty() || servers.isEmpty()) {
430       return null;
431     }
432     Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<ServerName, List<HRegionInfo>>();
433     int numRegions = regions.size();
434     int numServers = servers.size();
435     int max = (int) Math.ceil((float) numRegions / numServers);
436     int serverIdx = 0;
437     if (numServers > 1) {
438       serverIdx = RANDOM.nextInt(numServers);
439     }
440     int regionIdx = 0;
441     for (int j = 0; j < numServers; j++) {
442       ServerName server = servers.get((j + serverIdx) % numServers);
443       List<HRegionInfo> serverRegions = new ArrayList<HRegionInfo>(max);
444       for (int i = regionIdx; i < numRegions; i += numServers) {
445         serverRegions.add(regions.get(i % numRegions));
446       }
447       assignments.put(server, serverRegions);
448       regionIdx++;
449     }
450     return assignments;
451   }
452 
453   /**
454    * Generates an immediate assignment plan to be used by a new master for
455    * regions in transition that do not have an already known destination.
456    *
457    * Takes a list of regions that need immediate assignment and a list of all
458    * available servers. Returns a map of regions to the server they should be
459    * assigned to.
460    *
461    * This method will return quickly and does not do any intelligent balancing.
462    * The goal is to make a fast decision not the best decision possible.
463    *
464    * Currently this is random.
465    *
466    * @param regions
467    * @param servers
468    * @return map of regions to the server it should be assigned to
469    */
470   public Map<HRegionInfo, ServerName> immediateAssignment(List<HRegionInfo> regions,
471       List<ServerName> servers) {
472     metricsBalancer.incrMiscInvocations();
473 
474     Map<HRegionInfo, ServerName> assignments = new TreeMap<HRegionInfo, ServerName>();
475     for (HRegionInfo region : regions) {
476       assignments.put(region, randomAssignment(region, servers));
477     }
478     return assignments;
479   }
480 
481   /**
482    * Used to assign a single region to a random server.
483    */
484   public ServerName randomAssignment(HRegionInfo regionInfo, List<ServerName> servers) {
485     metricsBalancer.incrMiscInvocations();
486 
487     if (servers == null || servers.isEmpty()) {
488       LOG.warn("Wanted to do random assignment but no servers to assign to");
489       return null;
490     }
491     return servers.get(RANDOM.nextInt(servers.size()));
492   }
493 
494   /**
495    * Generates a bulk assignment startup plan, attempting to reuse the existing
496    * assignment information from META, but adjusting for the specified list of
497    * available/online servers available for assignment.
498    * <p>
499    * Takes a map of all regions to their existing assignment from META. Also
500    * takes a list of online servers for regions to be assigned to. Attempts to
501    * retain all assignment, so in some instances initial assignment will not be
502    * completely balanced.
503    * <p>
504    * Any leftover regions without an existing server to be assigned to will be
505    * assigned randomly to available servers.
506    *
507    * @param regions regions and existing assignment from meta
508    * @param servers available servers
509    * @return map of servers and regions to be assigned to them
510    */
511   public Map<ServerName, List<HRegionInfo>> retainAssignment(Map<HRegionInfo, ServerName> regions,
512       List<ServerName> servers) {
513     // Update metrics
514     metricsBalancer.incrMiscInvocations();
515 
516     // Group all of the old assignments by their hostname.
517     // We can't group directly by ServerName since the servers all have
518     // new start-codes.
519 
520     // Group the servers by their hostname. It's possible we have multiple
521     // servers on the same host on different ports.
522     ArrayListMultimap<String, ServerName> serversByHostname = ArrayListMultimap.create();
523     for (ServerName server : servers) {
524       serversByHostname.put(server.getHostname(), server);
525     }
526 
527     // Now come up with new assignments
528     Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<ServerName, List<HRegionInfo>>();
529 
530     for (ServerName server : servers) {
531       assignments.put(server, new ArrayList<HRegionInfo>());
532     }
533 
534     // Collection of the hostnames that used to have regions
535     // assigned, but for which we no longer have any RS running
536     // after the cluster restart.
537     Set<String> oldHostsNoLongerPresent = Sets.newTreeSet();
538 
539     int numRandomAssignments = 0;
540     int numRetainedAssigments = 0;
541     for (Map.Entry<HRegionInfo, ServerName> entry : regions.entrySet()) {
542       HRegionInfo region = entry.getKey();
543       ServerName oldServerName = entry.getValue();
544       List<ServerName> localServers = new ArrayList<ServerName>();
545       if (oldServerName != null) {
546         localServers = serversByHostname.get(oldServerName.getHostname());
547       }
548       if (localServers.isEmpty()) {
549         // No servers on the new cluster match up with this hostname,
550         // assign randomly.
551         ServerName randomServer = servers.get(RANDOM.nextInt(servers.size()));
552         assignments.get(randomServer).add(region);
553         numRandomAssignments++;
554         if (oldServerName != null) oldHostsNoLongerPresent.add(oldServerName.getHostname());
555       } else if (localServers.size() == 1) {
556         // the usual case - one new server on same host
557         assignments.get(localServers.get(0)).add(region);
558         numRetainedAssigments++;
559       } else {
560         // multiple new servers in the cluster on this same host
561         int size = localServers.size();
562         ServerName target =
563             localServers.contains(oldServerName) ? oldServerName : localServers.get(RANDOM
564                 .nextInt(size));
565         assignments.get(target).add(region);
566         numRetainedAssigments++;
567       }
568     }
569 
570     String randomAssignMsg = "";
571     if (numRandomAssignments > 0) {
572       randomAssignMsg =
573           numRandomAssignments + " regions were assigned "
574               + "to random hosts, since the old hosts for these regions are no "
575               + "longer present in the cluster. These hosts were:\n  "
576               + Joiner.on("\n  ").join(oldHostsNoLongerPresent);
577     }
578 
579     LOG.info("Reassigned " + regions.size() + " regions. " + numRetainedAssigments
580         + " retained the pre-restart assignment. " + randomAssignMsg);
581     return assignments;
582   }
583 
584   @Override
585   public void initialize() throws HBaseIOException{
586   }
587 
588   @Override
589   public boolean isStopped() {
590     return stopped;
591   }
592 
593   @Override
594   public void stop(String why) {
595     LOG.info("Load Balancer stop requested: "+why);
596     stopped = true;
597   }
598 }