View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Collections;
23  import java.util.HashMap;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.NavigableMap;
27  import java.util.Random;
28  import java.util.TreeMap;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.hadoop.classification.InterfaceAudience;
33  import org.apache.hadoop.hbase.HRegionInfo;
34  import org.apache.hadoop.hbase.ServerName;
35  import org.apache.hadoop.hbase.master.AssignmentManager;
36  import org.apache.hadoop.hbase.master.RegionPlan;
37  
38  import com.google.common.collect.MinMaxPriorityQueue;
39  
40  /**
41   * Makes decisions about the placement and movement of Regions across
42   * RegionServers.
43   *
44   * <p>Cluster-wide load balancing will occur only when there are no regions in
45   * transition and according to a fixed period of a time using {@link #balanceCluster(Map)}.
46   *
47   * <p>Inline region placement with {@link #immediateAssignment} can be used when
48   * the Master needs to handle closed regions that it currently does not have
49   * a destination set for.  This can happen during master failover.
50   *
51   * <p>On cluster startup, bulk assignment can be used to determine
52   * locations for all Regions in a cluster.
53   *
54   * <p>This classes produces plans for the {@link AssignmentManager} to execute.
55   */
56  @InterfaceAudience.Private
57  public class DefaultLoadBalancer extends BaseLoadBalancer {
58    private static final Log LOG = LogFactory.getLog(DefaultLoadBalancer.class);
59    private static final Random RANDOM = new Random(System.currentTimeMillis());
60  
61    private RegionInfoComparator riComparator = new RegionInfoComparator();
62    private RegionPlan.RegionPlanComparator rpComparator = new RegionPlan.RegionPlanComparator();
63  
64  
65    /**
66     * Stores additional per-server information about the regions added/removed
67     * during the run of the balancing algorithm.
68     *
69     * For servers that shed regions, we need to track which regions we have already
70     * shed. <b>nextRegionForUnload</b> contains the index in the list of regions on
71     * the server that is the next to be shed.
72     */
73    static class BalanceInfo {
74  
75      private final int nextRegionForUnload;
76      private int numRegionsAdded;
77  
78      public BalanceInfo(int nextRegionForUnload, int numRegionsAdded) {
79        this.nextRegionForUnload = nextRegionForUnload;
80        this.numRegionsAdded = numRegionsAdded;
81      }
82  
83      int getNextRegionForUnload() {
84        return nextRegionForUnload;
85      }
86  
87      int getNumRegionsAdded() {
88        return numRegionsAdded;
89      }
90  
91      void setNumRegionsAdded(int numAdded) {
92        this.numRegionsAdded = numAdded;
93      }
94    }
95  
96    /**
97     * Generate a global load balancing plan according to the specified map of
98     * server information to the most loaded regions of each server.
99     *
100    * The load balancing invariant is that all servers are within 1 region of the
101    * average number of regions per server.  If the average is an integer number,
102    * all servers will be balanced to the average.  Otherwise, all servers will
103    * have either floor(average) or ceiling(average) regions.
104    *
105    * HBASE-3609 Modeled regionsToMove using Guava's MinMaxPriorityQueue so that
106    *   we can fetch from both ends of the queue. 
107    * At the beginning, we check whether there was empty region server 
108    *   just discovered by Master. If so, we alternately choose new / old
109    *   regions from head / tail of regionsToMove, respectively. This alternation
110    *   avoids clustering young regions on the newly discovered region server.
111    *   Otherwise, we choose new regions from head of regionsToMove.
112    *   
113    * Another improvement from HBASE-3609 is that we assign regions from
114    *   regionsToMove to underloaded servers in round-robin fashion.
115    *   Previously one underloaded server would be filled before we move onto
116    *   the next underloaded server, leading to clustering of young regions.
117    *   
118    * Finally, we randomly shuffle underloaded servers so that they receive
119    *   offloaded regions relatively evenly across calls to balanceCluster().
120    *         
121    * The algorithm is currently implemented as such:
122    *
123    * <ol>
124    * <li>Determine the two valid numbers of regions each server should have,
125    *     <b>MIN</b>=floor(average) and <b>MAX</b>=ceiling(average).
126    *
127    * <li>Iterate down the most loaded servers, shedding regions from each so
128    *     each server hosts exactly <b>MAX</b> regions.  Stop once you reach a
129    *     server that already has &lt;= <b>MAX</b> regions.
130    *     <p>
131    *     Order the regions to move from most recent to least.
132    *
133    * <li>Iterate down the least loaded servers, assigning regions so each server
134    *     has exactly </b>MIN</b> regions.  Stop once you reach a server that
135    *     already has &gt;= <b>MIN</b> regions.
136    *
137    *     Regions being assigned to underloaded servers are those that were shed
138    *     in the previous step.  It is possible that there were not enough
139    *     regions shed to fill each underloaded server to <b>MIN</b>.  If so we
140    *     end up with a number of regions required to do so, <b>neededRegions</b>.
141    *
142    *     It is also possible that we were able to fill each underloaded but ended
143    *     up with regions that were unassigned from overloaded servers but that
144    *     still do not have assignment.
145    *
146    *     If neither of these conditions hold (no regions needed to fill the
147    *     underloaded servers, no regions leftover from overloaded servers),
148    *     we are done and return.  Otherwise we handle these cases below.
149    *
150    * <li>If <b>neededRegions</b> is non-zero (still have underloaded servers),
151    *     we iterate the most loaded servers again, shedding a single server from
152    *     each (this brings them from having <b>MAX</b> regions to having
153    *     <b>MIN</b> regions).
154    *
155    * <li>We now definitely have more regions that need assignment, either from
156    *     the previous step or from the original shedding from overloaded servers.
157    *     Iterate the least loaded servers filling each to <b>MIN</b>.
158    *
159    * <li>If we still have more regions that need assignment, again iterate the
160    *     least loaded servers, this time giving each one (filling them to
161    *     </b>MAX</b>) until we run out.
162    *
163    * <li>All servers will now either host <b>MIN</b> or <b>MAX</b> regions.
164    *
165    *     In addition, any server hosting &gt;= <b>MAX</b> regions is guaranteed
166    *     to end up with <b>MAX</b> regions at the end of the balancing.  This
167    *     ensures the minimal number of regions possible are moved.
168    * </ol>
169    *
170    * TODO: We can at-most reassign the number of regions away from a particular
171    *       server to be how many they report as most loaded.
172    *       Should we just keep all assignment in memory?  Any objections?
173    *       Does this mean we need HeapSize on HMaster?  Or just careful monitor?
174    *       (current thinking is we will hold all assignments in memory)
175    *
176    * @param clusterMap Map of regionservers and their load/region information to
177    *                   a list of their most loaded regions
178    * @return a list of regions to be moved, including source and destination,
179    *         or null if cluster is already balanced
180    */
181   public List<RegionPlan> balanceCluster(
182       Map<ServerName, List<HRegionInfo>> clusterMap) {
183     boolean emptyRegionServerPresent = false;
184     long startTime = System.currentTimeMillis();
185 
186 
187     ClusterLoadState cs = new ClusterLoadState(clusterMap);
188 
189     int numServers = cs.getNumServers();
190     if (numServers == 0) {
191       LOG.debug("numServers=0 so skipping load balancing");
192       return null;
193     }
194     NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
195 
196     int numRegions = cs.getNumRegions();
197 
198     if (!this.needsBalance(cs)) {
199       // Skipped because no server outside (min,max) range
200       float average = cs.getLoadAverage(); // for logging
201       LOG.info("Skipping load balancing because balanced cluster; " +
202         "servers=" + numServers + " " +
203         "regions=" + numRegions + " average=" + average + " " +
204         "mostloaded=" + serversByLoad.lastKey().getLoad() +
205         " leastloaded=" + serversByLoad.firstKey().getLoad());
206       return null;
207     }
208 
209     int min = numRegions / numServers;
210     int max = numRegions % numServers == 0 ? min : min + 1;
211 
212     // Using to check balance result.
213     StringBuilder strBalanceParam = new StringBuilder();
214     strBalanceParam.append("Balance parameter: numRegions=").append(numRegions)
215         .append(", numServers=").append(numServers).append(", max=").append(max)
216         .append(", min=").append(min);
217     LOG.debug(strBalanceParam.toString());
218 
219     // Balance the cluster
220     // TODO: Look at data block locality or a more complex load to do this
221     MinMaxPriorityQueue<RegionPlan> regionsToMove =
222       MinMaxPriorityQueue.orderedBy(rpComparator).create();
223     List<RegionPlan> regionsToReturn = new ArrayList<RegionPlan>();
224 
225     // Walk down most loaded, pruning each to the max
226     int serversOverloaded = 0;
227     // flag used to fetch regions from head and tail of list, alternately
228     boolean fetchFromTail = false;
229     Map<ServerName, BalanceInfo> serverBalanceInfo =
230       new TreeMap<ServerName, BalanceInfo>();
231     for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server:
232         serversByLoad.descendingMap().entrySet()) {
233       ServerAndLoad sal = server.getKey();
234       int regionCount = sal.getLoad();
235       if (regionCount <= max) {
236         serverBalanceInfo.put(sal.getServerName(), new BalanceInfo(0, 0));
237         break;
238       }
239       serversOverloaded++;
240       List<HRegionInfo> regions = server.getValue();
241       int numToOffload = Math.min(regionCount - max, regions.size());
242       // account for the out-of-band regions which were assigned to this server
243       // after some other region server crashed 
244       Collections.sort(regions, riComparator);
245       int numTaken = 0;
246       for (int i = 0; i <= numToOffload; ) {
247         HRegionInfo hri = regions.get(i); // fetch from head
248         if (fetchFromTail) {
249           hri = regions.get(regions.size() - 1 - i);
250         }
251         i++;
252         // Don't rebalance meta regions.
253         if (hri.isMetaRegion()) continue;
254         regionsToMove.add(new RegionPlan(hri, sal.getServerName(), null));
255         numTaken++;
256         if (numTaken >= numToOffload) break;
257         // fetch in alternate order if there is new region server
258         if (emptyRegionServerPresent) {
259           fetchFromTail = !fetchFromTail;
260         }
261       }
262       serverBalanceInfo.put(sal.getServerName(),
263         new BalanceInfo(numToOffload, (-1)*numTaken));
264     }
265     int totalNumMoved = regionsToMove.size();
266 
267     // Walk down least loaded, filling each to the min
268     int neededRegions = 0; // number of regions needed to bring all up to min
269     fetchFromTail = false;
270 
271     Map<ServerName, Integer> underloadedServers = new HashMap<ServerName, Integer>();
272     float average = (float)numRegions / numServers; // for logging
273     int maxToTake = numRegions - (int)average;
274     for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server:
275         serversByLoad.entrySet()) {
276       if (maxToTake == 0) break; // no more to take
277       int regionCount = server.getKey().getLoad();
278       if (regionCount >= min && regionCount > 0) {
279         continue; // look for other servers which haven't reached min
280       }
281       int regionsToPut = min - regionCount;
282       if (regionsToPut == 0)
283       {
284         regionsToPut = 1;
285         maxToTake--;
286       }
287       underloadedServers.put(server.getKey().getServerName(), regionsToPut);
288     }
289     // number of servers that get new regions
290     int serversUnderloaded = underloadedServers.size();
291     int incr = 1;
292     List<ServerName> sns =
293       Arrays.asList(underloadedServers.keySet().toArray(new ServerName[serversUnderloaded]));
294     Collections.shuffle(sns, RANDOM);
295     while (regionsToMove.size() > 0) {
296       int cnt = 0;
297       int i = incr > 0 ? 0 : underloadedServers.size()-1;
298       for (; i >= 0 && i < underloadedServers.size(); i += incr) {
299         if (regionsToMove.isEmpty()) break;
300         ServerName si = sns.get(i);
301         int numToTake = underloadedServers.get(si);
302         if (numToTake == 0) continue;
303 
304         addRegionPlan(regionsToMove, fetchFromTail, si, regionsToReturn);
305         if (emptyRegionServerPresent) {
306           fetchFromTail = !fetchFromTail;
307         }
308 
309         underloadedServers.put(si, numToTake-1);
310         cnt++;
311         BalanceInfo bi = serverBalanceInfo.get(si);
312         if (bi == null) {
313           bi = new BalanceInfo(0, 0);
314           serverBalanceInfo.put(si, bi);
315         }
316         bi.setNumRegionsAdded(bi.getNumRegionsAdded()+1);
317       }
318       if (cnt == 0) break;
319       // iterates underloadedServers in the other direction
320       incr = -incr;
321     }
322     for (Integer i : underloadedServers.values()) {
323       // If we still want to take some, increment needed
324       neededRegions += i;
325     }
326 
327     // If none needed to fill all to min and none left to drain all to max,
328     // we are done
329     if (neededRegions == 0 && regionsToMove.isEmpty()) {
330       long endTime = System.currentTimeMillis();
331       LOG.info("Calculated a load balance in " + (endTime-startTime) + "ms. " +
332           "Moving " + totalNumMoved + " regions off of " +
333           serversOverloaded + " overloaded servers onto " +
334           serversUnderloaded + " less loaded servers");
335       return regionsToReturn;
336     }
337 
338     // Need to do a second pass.
339     // Either more regions to assign out or servers that are still underloaded
340 
341     // If we need more to fill min, grab one from each most loaded until enough
342     if (neededRegions != 0) {
343       // Walk down most loaded, grabbing one from each until we get enough
344       for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server :
345         serversByLoad.descendingMap().entrySet()) {
346         BalanceInfo balanceInfo =
347           serverBalanceInfo.get(server.getKey().getServerName());
348         int idx =
349           balanceInfo == null ? 0 : balanceInfo.getNextRegionForUnload();
350         if (idx >= server.getValue().size()) break;
351         HRegionInfo region = server.getValue().get(idx);
352         if (region.isMetaRegion()) continue; // Don't move meta regions.
353         regionsToMove.add(new RegionPlan(region, server.getKey().getServerName(), null));
354         totalNumMoved++;
355         if (--neededRegions == 0) {
356           // No more regions needed, done shedding
357           break;
358         }
359       }
360     }
361 
362     // Now we have a set of regions that must be all assigned out
363     // Assign each underloaded up to the min, then if leftovers, assign to max
364 
365     // Walk down least loaded, assigning to each to fill up to min
366     for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server :
367         serversByLoad.entrySet()) {
368       int regionCount = server.getKey().getLoad();
369       if (regionCount >= min) break;
370       BalanceInfo balanceInfo = serverBalanceInfo.get(server.getKey().getServerName());
371       if(balanceInfo != null) {
372         regionCount += balanceInfo.getNumRegionsAdded();
373       }
374       if(regionCount >= min) {
375         continue;
376       }
377       int numToTake = min - regionCount;
378       int numTaken = 0;
379       while(numTaken < numToTake && 0 < regionsToMove.size()) {
380         addRegionPlan(regionsToMove, fetchFromTail,
381           server.getKey().getServerName(), regionsToReturn);
382         numTaken++;
383         if (emptyRegionServerPresent) {
384           fetchFromTail = !fetchFromTail;
385         }
386       }
387     }
388 
389     // If we still have regions to dish out, assign underloaded to max
390     if (0 < regionsToMove.size()) {
391       for (Map.Entry<ServerAndLoad, List<HRegionInfo>> server :
392         serversByLoad.entrySet()) {
393         int regionCount = server.getKey().getLoad();
394         if(regionCount >= max) {
395           break;
396         }
397         addRegionPlan(regionsToMove, fetchFromTail,
398           server.getKey().getServerName(), regionsToReturn);
399         if (emptyRegionServerPresent) {
400           fetchFromTail = !fetchFromTail;
401         }
402         if (regionsToMove.isEmpty()) {
403           break;
404         }
405       }
406     }
407 
408     long endTime = System.currentTimeMillis();
409 
410     if (!regionsToMove.isEmpty() || neededRegions != 0) {
411       // Emit data so can diagnose how balancer went astray.
412       LOG.warn("regionsToMove=" + totalNumMoved +
413         ", numServers=" + numServers + ", serversOverloaded=" + serversOverloaded +
414         ", serversUnderloaded=" + serversUnderloaded);
415       StringBuilder sb = new StringBuilder();
416       for (Map.Entry<ServerName, List<HRegionInfo>> e: clusterMap.entrySet()) {
417         if (sb.length() > 0) sb.append(", ");
418         sb.append(e.getKey().toString());
419         sb.append(" ");
420         sb.append(e.getValue().size());
421       }
422       LOG.warn("Input " + sb.toString());
423     }
424 
425     // All done!
426     LOG.info("Done. Calculated a load balance in " + (endTime-startTime) + "ms. " +
427         "Moving " + totalNumMoved + " regions off of " +
428         serversOverloaded + " overloaded servers onto " +
429         serversUnderloaded + " less loaded servers");
430 
431     return regionsToReturn;
432   }
433 
434   /**
435    * Add a region from the head or tail to the List of regions to return.
436    */
437   private void addRegionPlan(final MinMaxPriorityQueue<RegionPlan> regionsToMove,
438       final boolean fetchFromTail, final ServerName sn, List<RegionPlan> regionsToReturn) {
439     RegionPlan rp = null;
440     if (!fetchFromTail) rp = regionsToMove.remove();
441     else rp = regionsToMove.removeLast();
442     rp.setDestination(sn);
443     regionsToReturn.add(rp);
444   }
445 }