View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayDeque;
21  import java.util.Collection;
22  import java.util.Deque;
23  import java.util.HashMap;
24  import java.util.LinkedList;
25  import java.util.List;
26  import java.util.Map;
27  import java.util.Map.Entry;
28  import java.util.Random;
29  
30  import org.apache.commons.logging.Log;
31  import org.apache.commons.logging.LogFactory;
32  import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
33  import org.apache.hadoop.hbase.classification.InterfaceAudience;
34  import org.apache.hadoop.conf.Configuration;
35  import org.apache.hadoop.hbase.ClusterStatus;
36  import org.apache.hadoop.hbase.HBaseInterfaceAudience;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.RegionLoad;
39  import org.apache.hadoop.hbase.ServerLoad;
40  import org.apache.hadoop.hbase.ServerName;
41  import org.apache.hadoop.hbase.master.MasterServices;
42  import org.apache.hadoop.hbase.master.RegionPlan;
43  import org.apache.hadoop.hbase.util.Bytes;
44  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
45  import org.apache.hadoop.hbase.util.Pair;
46  
47  /**
48   * <p>This is a best effort load balancer. Given a Cost function F(C) => x It will
49   * randomly try and mutate the cluster to Cprime. If F(Cprime) < F(C) then the
50   * new cluster state becomes the plan. It includes cost functions to compute the cost of:</p>
51   * <ul>
52   * <li>Region Load</li>
53   * <li>Table Load</li>
54   * <li>Data Locality</li>
55   * <li>Memstore Sizes</li>
56   * <li>Storefile Sizes</li>
57   * </ul>
58   *
59   *
60   * <p>Every cost function returns a number between 0 and 1 inclusive; where 0 is the lowest cost
61   * best solution, and 1 is the highest possible cost and the worst solution.  The computed costs are
62   * scaled by their respective multipliers:</p>
63   *
64   * <ul>
65   *   <li>hbase.master.balancer.stochastic.regionCountCost</li>
66   *   <li>hbase.master.balancer.stochastic.moveCost</li>
67   *   <li>hbase.master.balancer.stochastic.tableSkewCost</li>
68   *   <li>hbase.master.balancer.stochastic.localityCost</li>
69   *   <li>hbase.master.balancer.stochastic.memstoreSizeCost</li>
70   *   <li>hbase.master.balancer.stochastic.storefileSizeCost</li>
71   * </ul>
72   *
73   * <p>In addition to the above configurations, the balancer can be tuned by the following
74   * configuration values:</p>
75   * <ul>
76   *   <li>hbase.master.balancer.stochastic.maxMovePercent which
77   *   controls the maximum fraction of all regions that can be moved in a single invocation of
78   *   this balancer.</li>
79   *   <li>hbase.master.balancer.stochastic.stepsPerRegion is the coefficient by which the number of
80   *   regions is multiplied to try and get the number of times the balancer will
81   *   mutate all servers.</li>
82   *   <li>hbase.master.balancer.stochastic.maxSteps which controls the maximum number of times that
83   *   the balancer will try and mutate all the servers. The balancer will use the minimum of this
84   *   value and the above computation.</li>
85   * </ul>
86   *
87   * <p>This balancer is best used with hbase.master.loadbalance.bytable set to false
88   * so that the balancer gets the full picture of all loads on the cluster.</p>
89   */
90  @InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
91  public class StochasticLoadBalancer extends BaseLoadBalancer {
92  
93    private static final String STEPS_PER_REGION_KEY =
94        "hbase.master.balancer.stochastic.stepsPerRegion";
95    private static final String MAX_STEPS_KEY =
96        "hbase.master.balancer.stochastic.maxSteps";
97    private static final String MAX_RUNNING_TIME_KEY =
98        "hbase.master.balancer.stochastic.maxRunningTime";
99    private static final String KEEP_REGION_LOADS =
100       "hbase.master.balancer.stochastic.numRegionLoadsToRemember";
101 
102   private static final Random RANDOM = new Random(System.currentTimeMillis());
103   private static final Log LOG = LogFactory.getLog(StochasticLoadBalancer.class);
104 
105   private final RegionLocationFinder regionFinder = new RegionLocationFinder();
106   private ClusterStatus clusterStatus = null;
107   Map<String, Deque<RegionLoad>> loads = new HashMap<String, Deque<RegionLoad>>();
108 
109   // values are defaults
110   private int maxSteps = 1000000;
111   private int stepsPerRegion = 800;
112   private long maxRunningTime = 30 * 1000 * 1; // 30 seconds.
113   private int numRegionLoadsToRemember = 15;
114 
115   private RegionPicker[] pickers;
116   private CostFromRegionLoadFunction[] regionLoadFunctions;
117   private CostFunction[] costFunctions;
118   // Keep locality based picker and cost function to alert them
119   // when new services are offered
120   private LocalityBasedPicker localityPicker;
121   private LocalityCostFunction localityCost;
122 
123   @Override
124   public void setConf(Configuration conf) {
125     super.setConf(conf);
126 
127     regionFinder.setConf(conf);
128 
129     maxSteps = conf.getInt(MAX_STEPS_KEY, maxSteps);
130 
131     stepsPerRegion = conf.getInt(STEPS_PER_REGION_KEY, stepsPerRegion);
132     maxRunningTime = conf.getLong(MAX_RUNNING_TIME_KEY, maxRunningTime);
133 
134     numRegionLoadsToRemember = conf.getInt(KEEP_REGION_LOADS, numRegionLoadsToRemember);
135 
136     localityPicker = new LocalityBasedPicker(services);
137     localityCost = new LocalityCostFunction(conf, services);
138 
139     pickers = new RegionPicker[] {
140       new RandomRegionPicker(),
141       new LoadPicker(),
142       localityPicker
143     };
144 
145     regionLoadFunctions = new CostFromRegionLoadFunction[] {
146       new ReadRequestCostFunction(conf),
147       new WriteRequestCostFunction(conf),
148       new MemstoreSizeCostFunction(conf),
149       new StoreFileCostFunction(conf)
150     };
151 
152     costFunctions = new CostFunction[]{
153       new RegionCountSkewCostFunction(conf),
154       new MoveCostFunction(conf),
155       localityCost,
156       new TableSkewCostFunction(conf),
157       regionLoadFunctions[0],
158       regionLoadFunctions[1],
159       regionLoadFunctions[2],
160       regionLoadFunctions[3],
161     };
162   }
163 
164   @Override
165   protected void setSlop(Configuration conf) {
166     this.slop = conf.getFloat("hbase.regions.slop", 0.001F);
167   }
168 
169   @Override
170   public void setClusterStatus(ClusterStatus st) {
171     super.setClusterStatus(st);
172     regionFinder.setClusterStatus(st);
173     this.clusterStatus = st;
174     updateRegionLoad();
175     for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
176       cost.setClusterStatus(st);
177     }
178   }
179 
180   @Override
181   public void setMasterServices(MasterServices masterServices) {
182     super.setMasterServices(masterServices);
183     this.regionFinder.setServices(masterServices);
184     this.localityCost.setServices(masterServices);
185     this.localityPicker.setServices(masterServices);
186 
187   }
188 
189   /**
190    * Given the cluster state this will try and approach an optimal balance. This
191    * should always approach the optimal state given enough steps.
192    */
193   @Override
194   public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterState) {
195     if (!needsBalance(new ClusterLoadState(clusterState))) {
196       return null;
197     }
198 
199     long startTime = EnvironmentEdgeManager.currentTimeMillis();
200 
201     // Keep track of servers to iterate through them.
202     Cluster cluster = new Cluster(clusterState, loads, regionFinder);
203     double currentCost = computeCost(cluster, Double.MAX_VALUE);
204 
205     double initCost = currentCost;
206     double newCost = currentCost;
207 
208     long computedMaxSteps = Math.min(this.maxSteps,
209         ((long)cluster.numRegions * (long)this.stepsPerRegion * (long)cluster.numServers));
210     // Perform a stochastic walk to see if we can get a good fit.
211     long step;
212     for (step = 0; step < computedMaxSteps; step++) {
213       int pickerIdx = RANDOM.nextInt(pickers.length);
214       RegionPicker p = pickers[pickerIdx];
215       Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> picks = p.pick(cluster);
216 
217       int leftServer = picks.getFirst().getFirst();
218       int leftRegion = picks.getFirst().getSecond();
219       int rightServer = picks.getSecond().getFirst();
220       int rightRegion = picks.getSecond().getSecond();
221 
222       // We couldn't find a server
223       if (rightServer < 0 || leftServer < 0) {
224         continue;
225       }
226 
227       // We randomly picked to do nothing.
228       if (leftRegion < 0 && rightRegion < 0) {
229         continue;
230       }
231 
232       cluster.moveOrSwapRegion(leftServer,
233           rightServer,
234           leftRegion,
235           rightRegion);
236 
237       newCost = computeCost(cluster, currentCost);
238       // Should this be kept?
239       if (newCost < currentCost) {
240         currentCost = newCost;
241       } else {
242         // Put things back the way they were before.
243         // TODO: undo by remembering old values, using an UndoAction class
244         cluster.moveOrSwapRegion(leftServer,
245             rightServer,
246             rightRegion,
247             leftRegion);
248       }
249 
250       if (EnvironmentEdgeManager.currentTimeMillis() - startTime >
251           maxRunningTime) {
252         break;
253       }
254     }
255 
256     long endTime = EnvironmentEdgeManager.currentTimeMillis();
257 
258     metricsBalancer.balanceCluster(endTime - startTime);
259 
260     if (initCost > currentCost) {
261       List<RegionPlan> plans = createRegionPlans(cluster);
262       if (LOG.isDebugEnabled()) {
263         LOG.debug("Finished computing new load balance plan.  Computation took "
264             + (endTime - startTime) + "ms to try " + step
265             + " different iterations.  Found a solution that moves "
266             + plans.size() + " regions; Going from a computed cost of "
267             + initCost + " to a new cost of " + currentCost);
268       }
269       return plans;
270     }
271     if (LOG.isDebugEnabled()) {
272       LOG.debug("Could not find a better load balance plan.  Tried "
273           + step + " different configurations in " + (endTime - startTime)
274           + "ms, and did not find anything with a computed cost less than " + initCost);
275     }
276     return null;
277   }
278 
279   /**
280    * Create all of the RegionPlan's needed to move from the initial cluster state to the desired
281    * state.
282    *
283    * @param cluster The state of the cluster
284    * @return List of RegionPlan's that represent the moves needed to get to desired final state.
285    */
286   private List<RegionPlan> createRegionPlans(Cluster cluster) {
287     List<RegionPlan> plans = new LinkedList<RegionPlan>();
288     for (int regionIndex = 0;
289          regionIndex < cluster.regionIndexToServerIndex.length; regionIndex++) {
290       int initialServerIndex = cluster.initialRegionIndexToServerIndex[regionIndex];
291       int newServerIndex = cluster.regionIndexToServerIndex[regionIndex];
292 
293       if (initialServerIndex != newServerIndex) {
294         HRegionInfo region = cluster.regions[regionIndex];
295         ServerName initialServer = cluster.servers[initialServerIndex];
296         ServerName newServer = cluster.servers[newServerIndex];
297 
298         if (LOG.isTraceEnabled()) {
299           LOG.trace("Moving Region " + region.getEncodedName() + " from server "
300               + initialServer.getHostname() + " to " + newServer.getHostname());
301         }
302         RegionPlan rp = new RegionPlan(region, initialServer, newServer);
303         plans.add(rp);
304       }
305     }
306     return plans;
307   }
308 
309   /**
310    * Store the current region loads.
311    */
312   private synchronized void updateRegionLoad() {
313     // We create a new hashmap so that regions that are no longer there are removed.
314     // However we temporarily need the old loads so we can use them to keep the rolling average.
315     Map<String, Deque<RegionLoad>> oldLoads = loads;
316     loads = new HashMap<String, Deque<RegionLoad>>();
317 
318     for (ServerName sn : clusterStatus.getServers()) {
319       ServerLoad sl = clusterStatus.getLoad(sn);
320       if (sl == null) {
321         continue;
322       }
323       for (Entry<byte[], RegionLoad> entry : sl.getRegionsLoad().entrySet()) {
324         Deque<RegionLoad> rLoads = oldLoads.get(Bytes.toString(entry.getKey()));
325         if (rLoads == null) {
326           // There was nothing there
327           rLoads = new ArrayDeque<RegionLoad>();
328         } else if (rLoads.size() >= numRegionLoadsToRemember) {
329           rLoads.remove();
330         }
331         rLoads.add(entry.getValue());
332         loads.put(Bytes.toString(entry.getKey()), rLoads);
333 
334       }
335     }
336 
337     for(CostFromRegionLoadFunction cost : regionLoadFunctions) {
338       cost.setLoads(loads);
339     }
340   }
341 
342 
343   /**
344    * This is the main cost function.  It will compute a cost associated with a proposed cluster
345    * state.  All different costs will be combined with their multipliers to produce a double cost.
346    *
347    * @param cluster The state of the cluster
348    * @param previousCost the previous cost. This is used as an early out.
349    * @return a double of a cost associated with the proposed cluster state.  This cost is an
350    *         aggregate of all individual cost functions.
351    */
352   protected double computeCost(Cluster cluster, double previousCost) {
353     double total = 0;
354 
355     for (CostFunction c:costFunctions) {
356       if (c.getMultiplier() <= 0) {
357         continue;
358       }
359 
360       total += c.getMultiplier() * c.cost(cluster);
361 
362       if (total > previousCost) {
363         return total;
364       }
365     }
366     return total;
367   }
368 
369   abstract static class RegionPicker {
370     abstract Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster);
371 
372     /**
373      * From a list of regions pick a random one. Null can be returned which
374      * {@link StochasticLoadBalancer#balanceCluster(Map)} recognize as signal to try a region move
375      * rather than swap.
376      *
377      * @param cluster        The state of the cluster
378      * @param server         index of the server
379      * @param chanceOfNoSwap Chance that this will decide to try a move rather
380      *                       than a swap.
381      * @return a random {@link HRegionInfo} or null if an asymmetrical move is
382      *         suggested.
383      */
384     protected int pickRandomRegion(Cluster cluster, int server, double chanceOfNoSwap) {
385       // Check to see if this is just a move.
386       if (cluster.regionsPerServer[server].length == 0 || RANDOM.nextFloat() < chanceOfNoSwap) {
387         // signal a move only.
388         return -1;
389       }
390       int rand = RANDOM.nextInt(cluster.regionsPerServer[server].length);
391       return cluster.regionsPerServer[server][rand];
392 
393     }
394     protected int pickRandomServer(Cluster cluster) {
395       if (cluster.numServers < 1) {
396         return -1;
397       }
398 
399       return RANDOM.nextInt(cluster.numServers);
400     }
401     protected int pickOtherRandomServer(Cluster cluster, int serverIndex) {
402       if (cluster.numServers < 2) {
403         return -1;
404       }
405       while (true) {
406         int otherServerIndex = pickRandomServer(cluster);
407         if (otherServerIndex != serverIndex) {
408           return otherServerIndex;
409         }
410       }
411     }
412 
413     protected Pair<Integer, Integer> pickRandomRegions(Cluster cluster,
414                                                        int thisServer,
415                                                        int otherServer) {
416       if (thisServer < 0 || otherServer < 0) {
417         return new Pair<Integer, Integer>(-1, -1);
418       }
419 
420       // Decide who is most likely to need another region
421       int thisRegionCount = cluster.getNumRegions(thisServer);
422       int otherRegionCount = cluster.getNumRegions(otherServer);
423 
424       // Assign the chance based upon the above
425       double thisChance = (thisRegionCount > otherRegionCount) ? 0 : 0.5;
426       double otherChance = (thisRegionCount <= otherRegionCount) ? 0 : 0.5;
427 
428       int thisRegion = pickRandomRegion(cluster, thisServer, thisChance);
429       int otherRegion = pickRandomRegion(cluster, otherServer, otherChance);
430 
431       return new Pair<Integer, Integer>(thisRegion, otherRegion);
432     }
433   }
434 
435   static class RandomRegionPicker extends RegionPicker {
436 
437     @Override
438     Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster) {
439 
440       int thisServer = pickRandomServer(cluster);
441 
442       // Pick the other server
443       int otherServer = pickOtherRandomServer(cluster, thisServer);
444 
445       Pair<Integer, Integer> regions = pickRandomRegions(cluster, thisServer, otherServer);
446 
447       return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
448           new Pair<Integer, Integer>(thisServer, regions.getFirst()),
449           new Pair<Integer, Integer>(otherServer, regions.getSecond())
450 
451       );
452     }
453 
454   }
455 
456   public static class LoadPicker extends RegionPicker {
457 
458     @Override
459     Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster) {
460       cluster.sortServersByRegionCount();
461       int thisServer = pickMostLoadedServer(cluster, -1);
462       int otherServer = pickLeastLoadedServer(cluster, thisServer);
463 
464       Pair<Integer, Integer> regions = pickRandomRegions(cluster, thisServer, otherServer);
465       return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
466           new Pair<Integer, Integer>(thisServer, regions.getFirst()),
467           new Pair<Integer, Integer>(otherServer, regions.getSecond())
468 
469       );
470     }
471 
472     private int pickLeastLoadedServer(final Cluster cluster, int thisServer) {
473       Integer[] servers = cluster.serverIndicesSortedByRegionCount;
474 
475       int index = 0;
476       while (servers[index] == null || servers[index] == thisServer) {
477         index++;
478         if (index == servers.length) {
479           return -1;
480         }
481       }
482       return servers[index];
483     }
484 
485     private int pickMostLoadedServer(final Cluster cluster, int thisServer) {
486       Integer[] servers = cluster.serverIndicesSortedByRegionCount;
487 
488       int index = servers.length - 1;
489       while (servers[index] == null || servers[index] == thisServer) {
490         index--;
491         if (index < 0) {
492           return -1;
493         }
494       }
495       return servers[index];
496     }
497   }
498 
499   static class LocalityBasedPicker extends RegionPicker {
500 
501     private MasterServices masterServices;
502 
503     LocalityBasedPicker(MasterServices masterServices) {
504       this.masterServices = masterServices;
505     }
506 
507     @Override
508     Pair<Pair<Integer, Integer>, Pair<Integer, Integer>> pick(Cluster cluster) {
509       if (this.masterServices == null) {
510         return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
511             new Pair<Integer, Integer>(-1,-1),
512             new Pair<Integer, Integer>(-1,-1)
513         );
514       }
515       // Pick a random region server
516       int thisServer = pickRandomServer(cluster);
517 
518       // Pick a random region on this server
519       int thisRegion = pickRandomRegion(cluster, thisServer, 0.0f);
520 
521       if (thisRegion == -1) {
522         return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
523             new Pair<Integer, Integer>(-1,-1),
524             new Pair<Integer, Integer>(-1,-1)
525         );
526       }
527 
528       // Pick the server with the highest locality
529       int otherServer = pickHighestLocalityServer(cluster, thisServer, thisRegion);
530 
531       // pick an region on the other server to potentially swap
532       int otherRegion = this.pickRandomRegion(cluster, otherServer, 0.5f);
533 
534       return new Pair<Pair<Integer, Integer>, Pair<Integer, Integer>>(
535           new Pair<Integer, Integer>(thisServer,thisRegion),
536           new Pair<Integer, Integer>(otherServer,otherRegion)
537       );
538     }
539 
540     private int pickHighestLocalityServer(Cluster cluster, int thisServer, int thisRegion) {
541       int[] regionLocations = cluster.regionLocations[thisRegion];
542 
543       if (regionLocations == null || regionLocations.length <= 1) {
544         return pickOtherRandomServer(cluster, thisServer);
545       }
546 
547       for (int loc : regionLocations) {
548         if (loc >= 0 && loc != thisServer) { // find the first suitable server
549           return loc;
550         }
551       }
552 
553       // no location found
554       return pickOtherRandomServer(cluster, thisServer);
555     }
556 
557     void setServices(MasterServices services) {
558       this.masterServices = services;
559     }
560   }
561 
562   /**
563    * Base class of StochasticLoadBalancer's Cost Functions.
564    */
565   public abstract static class CostFunction {
566 
567     private float multiplier = 0;
568     private Configuration conf;
569 
570     CostFunction(Configuration c) {
571       this.conf = c;
572     }
573 
574     float getMultiplier() {
575       return multiplier;
576     }
577 
578     void setMultiplier(float m) {
579       this.multiplier = m;
580     }
581 
582     abstract double cost(Cluster cluster);
583 
584     /**
585      * Function to compute a scaled cost using {@link DescriptiveStatistics}. It
586      * assumes that this is a zero sum set of costs.  It assumes that the worst case
587      * possible is all of the elements in one region server and the rest having 0.
588      *
589      * @param stats the costs
590      * @return a scaled set of costs.
591      */
592     protected double costFromArray(double[] stats) {
593       double totalCost = 0;
594       double total = getSum(stats);
595       double mean = total/((double)stats.length);
596       double count = stats.length;
597 
598       // Compute max as if all region servers had 0 and one had the sum of all costs.  This must be
599       // a zero sum cost for this to make sense.
600       double max = ((count - 1) * mean) + (total - mean);
601 
602       // It's possible that there aren't enough regions to go around
603       double min;
604       if (count > total) {
605         min = ((count - total) * mean) + ((1 - mean) * total);
606       } else {
607         // Some will have 1 more than everything else.
608         int numHigh = (int) (total - (Math.floor(mean) * count));
609         int numLow = (int) (count - numHigh);
610 
611         min = (numHigh * (Math.ceil(mean) - mean)) + (numLow * (mean - Math.floor(mean)));
612 
613       }
614       min = Math.max(0, min);
615       for (int i=0; i<stats.length; i++) {
616         double n = stats[i];
617         double diff = Math.abs(mean - n);
618         totalCost += diff;
619       }
620 
621       double scaled =  scale(min, max, totalCost);
622       return scaled;
623     }
624 
625 
626 
627     private double getSum(double[] stats) {
628       double total = 0;
629       for(double s:stats) {
630         total += s;
631       }
632       return total;
633     }
634 
635     /**
636      * Scale the value between 0 and 1.
637      *
638      * @param min   Min value
639      * @param max   The Max value
640      * @param value The value to be scaled.
641      * @return The scaled value.
642      */
643     protected double scale(double min, double max, double value) {
644       if (max == 0 || value == 0) {
645         return 0;
646       }
647       if ((max - min) <= 0) return 0;
648 
649       return Math.max(0d, Math.min(1d, (value - min) / (max - min)));
650     }
651   }
652 
653   /**
654    * Given the starting state of the regions and a potential ending state
655    * compute cost based upon the number of regions that have moved.
656    */
657   public static class MoveCostFunction extends CostFunction {
658     private static final String MOVE_COST_KEY = "hbase.master.balancer.stochastic.moveCost";
659     private static final String MAX_MOVES_PERCENT_KEY =
660         "hbase.master.balancer.stochastic.maxMovePercent";
661     private static final float DEFAULT_MOVE_COST = 100;
662     private static final int DEFAULT_MAX_MOVES = 600;
663     private static final float DEFAULT_MAX_MOVE_PERCENT = 0.25f;
664     private static final int META_MOVE_COST_MULT = 10;
665 
666     private final float maxMovesPercent;
667 
668     MoveCostFunction(Configuration conf) {
669       super(conf);
670 
671       // Move cost multiplier should be the same cost or higher than the rest of the costs to ensure
672       // that large benefits are need to overcome the cost of a move.
673       this.setMultiplier(conf.getFloat(MOVE_COST_KEY, DEFAULT_MOVE_COST));
674       // What percent of the number of regions a single run of the balancer can move.
675       maxMovesPercent = conf.getFloat(MAX_MOVES_PERCENT_KEY, DEFAULT_MAX_MOVE_PERCENT);
676     }
677 
678     @Override
679     double cost(Cluster cluster) {
680       // Try and size the max number of Moves, but always be prepared to move some.
681       int maxMoves = Math.max((int) (cluster.numRegions * maxMovesPercent),
682           DEFAULT_MAX_MOVES);
683 
684       double moveCost = cluster.numMovedRegions;
685 
686       // Don't let this single balance move more than the max moves.
687       // This allows better scaling to accurately represent the actual cost of a move.
688       if (moveCost > maxMoves) {
689         return 1000000;   // return a number much greater than any of the other cost
690       }
691 
692       // hbase:meta region is special
693       if (cluster.numMovedMetaRegions > 0) {
694         // assume each hbase:meta region move costs 10 times
695         moveCost += META_MOVE_COST_MULT * cluster.numMovedMetaRegions;
696       }
697 
698       return scale(0, cluster.numRegions + META_MOVE_COST_MULT, moveCost);
699     }
700   }
701 
702   /**
703    * Compute the cost of a potential cluster state from skew in number of
704    * regions on a cluster.
705    */
706   public static class RegionCountSkewCostFunction extends CostFunction {
707     private static final String REGION_COUNT_SKEW_COST_KEY =
708         "hbase.master.balancer.stochastic.regionCountCost";
709     private static final float DEFAULT_REGION_COUNT_SKEW_COST = 500;
710 
711     private double[] stats = null;
712 
713     RegionCountSkewCostFunction(Configuration conf) {
714       super(conf);
715       // Load multiplier should be the greatest as it is the most general way to balance data.
716       this.setMultiplier(conf.getFloat(REGION_COUNT_SKEW_COST_KEY, DEFAULT_REGION_COUNT_SKEW_COST));
717     }
718 
719     @Override
720     double cost(Cluster cluster) {
721       if (stats == null || stats.length != cluster.numServers) {
722         stats = new double[cluster.numServers];
723       }
724 
725       for (int i =0; i < cluster.numServers; i++) {
726         stats[i] = cluster.regionsPerServer[i].length;
727       }
728 
729       return costFromArray(stats);
730     }
731   }
732 
733   /**
734    * Compute the cost of a potential cluster configuration based upon how evenly
735    * distributed tables are.
736    */
737   public static class TableSkewCostFunction extends CostFunction {
738 
739     private static final String TABLE_SKEW_COST_KEY =
740         "hbase.master.balancer.stochastic.tableSkewCost";
741     private static final float DEFAULT_TABLE_SKEW_COST = 35;
742 
743     TableSkewCostFunction(Configuration conf) {
744       super(conf);
745       this.setMultiplier(conf.getFloat(TABLE_SKEW_COST_KEY, DEFAULT_TABLE_SKEW_COST));
746     }
747 
748     @Override
749     double cost(Cluster cluster) {
750       double max = cluster.numRegions;
751       double min = ((double) cluster.numRegions) / cluster.numServers;
752       double value = 0;
753 
754       for (int i = 0; i < cluster.numMaxRegionsPerTable.length; i++) {
755         value += cluster.numMaxRegionsPerTable[i];
756       }
757 
758       return scale(min, max, value);
759     }
760   }
761 
762 
763   /**
764    * Compute a cost of a potential cluster configuration based upon where
765    * {@link org.apache.hadoop.hbase.regionserver.StoreFile}s are located.
766    */
767   public static class LocalityCostFunction extends CostFunction {
768 
769     private static final String LOCALITY_COST_KEY = "hbase.master.balancer.stochastic.localityCost";
770     private static final float DEFAULT_LOCALITY_COST = 25;
771 
772     private MasterServices services;
773 
774     LocalityCostFunction(Configuration conf, MasterServices srv) {
775       super(conf);
776       this.setMultiplier(conf.getFloat(LOCALITY_COST_KEY, DEFAULT_LOCALITY_COST));
777       this.services = srv;
778     }
779 
780     void setServices(MasterServices srvc) {
781       this.services = srvc;
782     }
783 
784     @Override
785     double cost(Cluster cluster) {
786       double max = 0;
787       double cost = 0;
788 
789       // If there's no master so there's no way anything else works.
790       if (this.services == null) {
791         return cost;
792       }
793 
794       for (int i = 0; i < cluster.regionLocations.length; i++) {
795         max += 1;
796         int serverIndex = cluster.regionIndexToServerIndex[i];
797         int[] regionLocations = cluster.regionLocations[i];
798 
799         // If we can't find where the data is getTopBlock returns null.
800         // so count that as being the best possible.
801         if (regionLocations == null) {
802           continue;
803         }
804 
805         int index = -1;
806         for (int j = 0; j < regionLocations.length; j++) {
807           if (regionLocations[j] >= 0 && regionLocations[j] == serverIndex) {
808             index = j;
809             break;
810           }
811         }
812 
813         if (index < 0) {
814           cost += 1;
815         } else {
816           cost += (double) index / (double) regionLocations.length;
817         }
818       }
819       return scale(0, max, cost);
820     }
821   }
822 
823   /**
824    * Base class the allows writing costs functions from rolling average of some
825    * number from RegionLoad.
826    */
827   public abstract static class CostFromRegionLoadFunction extends CostFunction {
828 
829     private ClusterStatus clusterStatus = null;
830     private Map<String, Deque<RegionLoad>> loads = null;
831     private double[] stats = null;
832     CostFromRegionLoadFunction(Configuration conf) {
833       super(conf);
834     }
835 
836     void setClusterStatus(ClusterStatus status) {
837       this.clusterStatus = status;
838     }
839 
840     void setLoads(Map<String, Deque<RegionLoad>> l) {
841       this.loads = l;
842     }
843 
844 
845     @Override
846     double cost(Cluster cluster) {
847       if (clusterStatus == null || loads == null) {
848         return 0;
849       }
850 
851       if (stats == null || stats.length != cluster.numServers) {
852         stats = new double[cluster.numServers];
853       }
854 
855       for (int i =0; i < stats.length; i++) {
856         //Cost this server has from RegionLoad
857         long cost = 0;
858 
859         // for every region on this server get the rl
860         for(int regionIndex:cluster.regionsPerServer[i]) {
861           Collection<RegionLoad> regionLoadList =  cluster.regionLoads[regionIndex];
862 
863           // Now if we found a region load get the type of cost that was requested.
864           if (regionLoadList != null) {
865             cost += getRegionLoadCost(regionLoadList);
866           }
867         }
868 
869         // Add the total cost to the stats.
870         stats[i] = cost;
871       }
872 
873       // Now return the scaled cost from data held in the stats object.
874       return costFromArray(stats);
875     }
876 
877     protected double getRegionLoadCost(Collection<RegionLoad> regionLoadList) {
878       double cost = 0;
879 
880       for (RegionLoad rl : regionLoadList) {
881         double toAdd = getCostFromRl(rl);
882 
883         if (cost == 0) {
884           cost = toAdd;
885         } else {
886           cost = (.5 * cost) + (.5 * toAdd);
887         }
888       }
889 
890       return cost;
891     }
892 
893     protected abstract double getCostFromRl(RegionLoad rl);
894   }
895 
896   /**
897    * Compute the cost of total number of read requests  The more unbalanced the higher the
898    * computed cost will be.  This uses a rolling average of regionload.
899    */
900 
901   public static class ReadRequestCostFunction extends CostFromRegionLoadFunction {
902 
903     private static final String READ_REQUEST_COST_KEY =
904         "hbase.master.balancer.stochastic.readRequestCost";
905     private static final float DEFAULT_READ_REQUEST_COST = 5;
906 
907     ReadRequestCostFunction(Configuration conf) {
908       super(conf);
909       this.setMultiplier(conf.getFloat(READ_REQUEST_COST_KEY, DEFAULT_READ_REQUEST_COST));
910     }
911 
912 
913     @Override
914     protected double getCostFromRl(RegionLoad rl) {
915       return rl.getReadRequestsCount();
916     }
917   }
918 
919   /**
920    * Compute the cost of total number of write requests.  The more unbalanced the higher the
921    * computed cost will be.  This uses a rolling average of regionload.
922    */
923   public static class WriteRequestCostFunction extends CostFromRegionLoadFunction {
924 
925     private static final String WRITE_REQUEST_COST_KEY =
926         "hbase.master.balancer.stochastic.writeRequestCost";
927     private static final float DEFAULT_WRITE_REQUEST_COST = 5;
928 
929     WriteRequestCostFunction(Configuration conf) {
930       super(conf);
931       this.setMultiplier(conf.getFloat(WRITE_REQUEST_COST_KEY, DEFAULT_WRITE_REQUEST_COST));
932     }
933 
934     @Override
935     protected double getCostFromRl(RegionLoad rl) {
936       return rl.getWriteRequestsCount();
937     }
938   }
939 
940   /**
941    * Compute the cost of total memstore size.  The more unbalanced the higher the
942    * computed cost will be.  This uses a rolling average of regionload.
943    */
944   public static class MemstoreSizeCostFunction extends CostFromRegionLoadFunction {
945 
946     private static final String MEMSTORE_SIZE_COST_KEY =
947         "hbase.master.balancer.stochastic.memstoreSizeCost";
948     private static final float DEFAULT_MEMSTORE_SIZE_COST = 5;
949 
950     MemstoreSizeCostFunction(Configuration conf) {
951       super(conf);
952       this.setMultiplier(conf.getFloat(MEMSTORE_SIZE_COST_KEY, DEFAULT_MEMSTORE_SIZE_COST));
953     }
954 
955     @Override
956     protected double getCostFromRl(RegionLoad rl) {
957       return rl.getMemStoreSizeMB();
958     }
959   }
960   /**
961    * Compute the cost of total open storefiles size.  The more unbalanced the higher the
962    * computed cost will be.  This uses a rolling average of regionload.
963    */
964   public static class StoreFileCostFunction extends CostFromRegionLoadFunction {
965 
966     private static final String STOREFILE_SIZE_COST_KEY =
967         "hbase.master.balancer.stochastic.storefileSizeCost";
968     private static final float DEFAULT_STOREFILE_SIZE_COST = 5;
969 
970     StoreFileCostFunction(Configuration conf) {
971       super(conf);
972       this.setMultiplier(conf.getFloat(STOREFILE_SIZE_COST_KEY, DEFAULT_STOREFILE_SIZE_COST));
973     }
974 
975     @Override
976     protected double getCostFromRl(RegionLoad rl) {
977       return rl.getStorefileSizeMB();
978     }
979   }
980 }