1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.hadoop.hbase.master.balancer;
19
20 import org.apache.commons.lang.mutable.MutableInt;
21 import org.apache.commons.logging.Log;
22 import org.apache.commons.logging.LogFactory;
23 import org.apache.commons.math.stat.descriptive.DescriptiveStatistics;
24 import org.apache.hadoop.classification.InterfaceAudience;
25 import org.apache.hadoop.conf.Configuration;
26 import org.apache.hadoop.hbase.ClusterStatus;
27 import org.apache.hadoop.hbase.HRegionInfo;
28 import org.apache.hadoop.hbase.ServerLoad;
29 import org.apache.hadoop.hbase.RegionLoad;
30 import org.apache.hadoop.hbase.ServerName;
31 import org.apache.hadoop.hbase.master.MasterServices;
32 import org.apache.hadoop.hbase.master.RegionPlan;
33 import org.apache.hadoop.hbase.util.Bytes;
34
35 import java.util.ArrayList;
36 import java.util.HashMap;
37 import java.util.LinkedList;
38 import java.util.List;
39 import java.util.Map;
40 import java.util.Map.Entry;
41 import java.util.Random;
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86 @InterfaceAudience.Private
87 public class StochasticLoadBalancer extends BaseLoadBalancer {
88
89 private static final String STOREFILE_SIZE_COST_KEY =
90 "hbase.master.balancer.stochastic.storefileSizeCost";
91 private static final String MEMSTORE_SIZE_COST_KEY =
92 "hbase.master.balancer.stochastic.memstoreSizeCost";
93 private static final String WRITE_REQUEST_COST_KEY =
94 "hbase.master.balancer.stochastic.writeRequestCost";
95 private static final String READ_REQUEST_COST_KEY =
96 "hbase.master.balancer.stochastic.readRequestCost";
97 private static final String LOCALITY_COST_KEY = "hbase.master.balancer.stochastic.localityCost";
98 private static final String TABLE_LOAD_COST_KEY =
99 "hbase.master.balancer.stochastic.tableLoadCost";
100 private static final String MOVE_COST_KEY = "hbase.master.balancer.stochastic.moveCost";
101 private static final String REGION_LOAD_COST_KEY =
102 "hbase.master.balancer.stochastic.regionLoadCost";
103 private static final String STEPS_PER_REGION_KEY =
104 "hbase.master.balancer.stochastic.stepsPerRegion";
105 private static final String MAX_STEPS_KEY = "hbase.master.balancer.stochastic.maxSteps";
106 private static final String MAX_MOVES_KEY = "hbase.master.balancer.stochastic.maxMoveRegions";
107 private static final String KEEP_REGION_LOADS = "hbase.master.balancer.stochastic.numRegionLoadsToRemember";
108
109 private static final Random RANDOM = new Random(System.currentTimeMillis());
110 private static final Log LOG = LogFactory.getLog(StochasticLoadBalancer.class);
111 private final RegionLocationFinder regionFinder = new RegionLocationFinder();
112 private ClusterStatus clusterStatus = null;
113 private Map<String, List<RegionLoad>> loads = new HashMap<String, List<RegionLoad>>();
114
115
116 private int maxSteps = 15000;
117 private int stepsPerRegion = 110;
118 private int maxMoves = 600;
119 private int numRegionLoadsToRemember = 15;
120 private float loadMultiplier = 55;
121 private float moveCostMultiplier = 5;
122 private float tableMultiplier = 5;
123 private float localityMultiplier = 5;
124 private float readRequestMultiplier = 0;
125 private float writeRequestMultiplier = 0;
126 private float memStoreSizeMultiplier = 5;
127 private float storeFileSizeMultiplier = 5;
128
129
130 @Override
131 public void setConf(Configuration conf) {
132 super.setConf(conf);
133 regionFinder.setConf(conf);
134
135 maxSteps = conf.getInt(MAX_STEPS_KEY, maxSteps);
136 maxMoves = conf.getInt(MAX_MOVES_KEY, maxMoves);
137 stepsPerRegion = conf.getInt(STEPS_PER_REGION_KEY, stepsPerRegion);
138
139 numRegionLoadsToRemember = conf.getInt(KEEP_REGION_LOADS, numRegionLoadsToRemember);
140
141
142 loadMultiplier = conf.getFloat(REGION_LOAD_COST_KEY, loadMultiplier);
143
144
145
146 moveCostMultiplier = conf.getFloat(MOVE_COST_KEY, moveCostMultiplier);
147
148
149
150 tableMultiplier = conf.getFloat(TABLE_LOAD_COST_KEY, tableMultiplier);
151 localityMultiplier = conf.getFloat(LOCALITY_COST_KEY, localityMultiplier);
152 memStoreSizeMultiplier = conf.getFloat(MEMSTORE_SIZE_COST_KEY, memStoreSizeMultiplier);
153 storeFileSizeMultiplier = conf.getFloat(STOREFILE_SIZE_COST_KEY, storeFileSizeMultiplier);
154 readRequestMultiplier = conf.getFloat(READ_REQUEST_COST_KEY, readRequestMultiplier);
155 writeRequestMultiplier = conf.getFloat(WRITE_REQUEST_COST_KEY, writeRequestMultiplier);
156 }
157
158 @Override
159 public void setClusterStatus(ClusterStatus st) {
160 super.setClusterStatus(st);
161 regionFinder.setClusterStatus(st);
162 this.clusterStatus = st;
163 updateRegionLoad();
164 }
165
166 @Override
167 public void setMasterServices(MasterServices masterServices) {
168 super.setMasterServices(masterServices);
169 this.services = masterServices;
170 this.regionFinder.setServices(masterServices);
171 }
172
173
174
175
176
177 @Override
178 public List<RegionPlan> balanceCluster(Map<ServerName, List<HRegionInfo>> clusterState) {
179
180
181 if (clusterState.size() <= 1) {
182 LOG.debug("Skipping load balance as cluster has only one node.");
183 return null;
184 }
185
186 long startTime = System.currentTimeMillis();
187
188
189 List<ServerName> servers = new ArrayList<ServerName>(clusterState.keySet());
190 Map<HRegionInfo, ServerName> initialRegionMapping = createRegionMapping(clusterState);
191 double currentCost, newCost, initCost;
192 currentCost = newCost = initCost = computeCost(initialRegionMapping, clusterState);
193
194 int computedMaxSteps =
195 Math.min(this.maxSteps, (initialRegionMapping.size() * this.stepsPerRegion));
196
197 for (int step = 0; step < computedMaxSteps; step++) {
198
199
200 for (ServerName leftServer : servers) {
201
202
203 ServerName rightServer = pickOtherServer(leftServer, servers);
204 if (rightServer == null) {
205 continue;
206 }
207
208
209 List<HRegionInfo> leftRegionList = clusterState.get(leftServer);
210 List<HRegionInfo> rightRegionList = clusterState.get(rightServer);
211
212
213
214 HRegionInfo lRegion = pickRandomRegion(leftRegionList, 0);
215 HRegionInfo rRegion = pickRandomRegion(rightRegionList, 0.5);
216
217
218 if (lRegion == null && rRegion == null) {
219 continue;
220 }
221
222 if (rRegion != null) {
223 leftRegionList.add(rRegion);
224 }
225
226 if (lRegion != null) {
227 rightRegionList.add(lRegion);
228 }
229
230 newCost = computeCost(initialRegionMapping, clusterState);
231
232
233 if (newCost < currentCost) {
234 currentCost = newCost;
235 } else {
236
237 if (rRegion != null) {
238 leftRegionList.remove(rRegion);
239 rightRegionList.add(rRegion);
240 }
241
242 if (lRegion != null) {
243 rightRegionList.remove(lRegion);
244 leftRegionList.add(lRegion);
245 }
246 }
247 }
248
249 }
250
251 long endTime = System.currentTimeMillis();
252
253 if (initCost > currentCost) {
254 List<RegionPlan> plans = createRegionPlans(initialRegionMapping, clusterState);
255
256 LOG.debug("Finished computing new laod balance plan. Computation took "
257 + (endTime - startTime) + "ms to try " + computedMaxSteps
258 + " different iterations. Found a solution that moves " + plans.size()
259 + " regions; Going from a computed cost of " + initCost + " to a new cost of "
260 + currentCost);
261 return plans;
262 }
263 LOG.debug("Could not find a better load balance plan. Tried " + computedMaxSteps
264 + " different configurations in " + (endTime - startTime)
265 + "ms, and did not find anything with a computed cost less than " + initCost);
266 return null;
267 }
268
269
270
271
272
273
274
275
276
277 private List<RegionPlan> createRegionPlans(Map<HRegionInfo, ServerName> initialRegionMapping,
278 Map<ServerName, List<HRegionInfo>> clusterState) {
279 List<RegionPlan> plans = new LinkedList<RegionPlan>();
280
281 for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
282 ServerName newServer = entry.getKey();
283
284 for (HRegionInfo region : entry.getValue()) {
285 ServerName initialServer = initialRegionMapping.get(region);
286 if (!newServer.equals(initialServer)) {
287 LOG.trace("Moving Region " + region.getEncodedName() + " from server "
288 + initialServer.getHostname() + " to " + newServer.getHostname());
289 RegionPlan rp = new RegionPlan(region, initialServer, newServer);
290 plans.add(rp);
291 }
292 }
293 }
294 return plans;
295 }
296
297
298
299
300
301
302
303
304
305 private Map<HRegionInfo, ServerName> createRegionMapping(
306 Map<ServerName, List<HRegionInfo>> clusterState) {
307 Map<HRegionInfo, ServerName> mapping = new HashMap<HRegionInfo, ServerName>();
308
309 for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
310 for (HRegionInfo region : entry.getValue()) {
311 mapping.put(region, entry.getKey());
312 }
313 }
314 return mapping;
315 }
316
317
318 private synchronized void updateRegionLoad() {
319
320
321
322 Map<String, List<RegionLoad>> oldLoads = loads;
323 loads = new HashMap<String, List<RegionLoad>>();
324
325 for (ServerName sn : clusterStatus.getServers()) {
326 ServerLoad sl = clusterStatus.getLoad(sn);
327 if (sl == null) continue;
328 for (Entry<byte[], RegionLoad> entry : sl.getRegionsLoad().entrySet()) {
329 List<RegionLoad> rLoads = oldLoads.get(Bytes.toString(entry.getKey()));
330 if (rLoads != null) {
331
332
333 if (rLoads.size() >= numRegionLoadsToRemember) {
334 int numToRemove = 1 + (rLoads.size() - numRegionLoadsToRemember);
335
336 rLoads = rLoads.subList(numToRemove, rLoads.size());
337 }
338
339 } else {
340
341 rLoads = new ArrayList<RegionLoad>();
342 }
343 rLoads.add(entry.getValue());
344 loads.put(Bytes.toString(entry.getKey()), rLoads);
345
346 }
347 }
348 }
349
350
351
352
353
354
355
356
357
358
359
360
361 private HRegionInfo pickRandomRegion(List<HRegionInfo> regions, double chanceOfNoSwap) {
362
363
364 if (regions.isEmpty() || RANDOM.nextFloat() < chanceOfNoSwap) {
365
366 return null;
367 }
368
369 int count = 0;
370 HRegionInfo r = null;
371
372
373 while (count < 10 && r == null ) {
374 count++;
375 r = regions.get(RANDOM.nextInt(regions.size()));
376
377
378
379 if (r.isMetaRegion()) {
380 r = null;
381 }
382 }
383 if (r != null) {
384 regions.remove(r);
385 }
386 return r;
387 }
388
389
390
391
392
393
394
395
396
397 private ServerName pickOtherServer(ServerName server, List<ServerName> allServers) {
398 ServerName s = null;
399 int count = 0;
400 while (count < 100 && (s == null || s.equals(server))) {
401 count++;
402 s = allServers.get(RANDOM.nextInt(allServers.size()));
403 }
404
405
406 return (s == null || s.equals(server)) ? null : s;
407 }
408
409
410
411
412
413
414
415
416
417 protected double computeCost(Map<HRegionInfo, ServerName> initialRegionMapping,
418 Map<ServerName, List<HRegionInfo>> clusterState) {
419
420 double moveCost = moveCostMultiplier * computeMoveCost(initialRegionMapping, clusterState);
421
422 double regionCountSkewCost = loadMultiplier * computeSkewLoadCost(clusterState);
423 double tableSkewCost = tableMultiplier * computeTableSkewLoadCost(clusterState);
424 double localityCost =
425 localityMultiplier * computeDataLocalityCost(initialRegionMapping, clusterState);
426
427 double memstoreSizeCost =
428 memStoreSizeMultiplier
429 * computeRegionLoadCost(clusterState, RegionLoadCostType.MEMSTORE_SIZE);
430 double storefileSizeCost =
431 storeFileSizeMultiplier
432 * computeRegionLoadCost(clusterState, RegionLoadCostType.STOREFILE_SIZE);
433
434
435 double readRequestCost =
436 readRequestMultiplier
437 * computeRegionLoadCost(clusterState, RegionLoadCostType.READ_REQUEST);
438 double writeRequestCost =
439 writeRequestMultiplier
440 * computeRegionLoadCost(clusterState, RegionLoadCostType.WRITE_REQUEST);
441
442 double total =
443 moveCost + regionCountSkewCost + tableSkewCost + localityCost + memstoreSizeCost
444 + storefileSizeCost + readRequestCost + writeRequestCost;
445 LOG.trace("Computed weights for a potential balancing total = " + total + " moveCost = "
446 + moveCost + " regionCountSkewCost = " + regionCountSkewCost + " tableSkewCost = "
447 + tableSkewCost + " localityCost = " + localityCost + " memstoreSizeCost = "
448 + memstoreSizeCost + " storefileSizeCost = " + storefileSizeCost);
449 return total;
450 }
451
452
453
454
455
456
457
458
459
460 double computeMoveCost(Map<HRegionInfo, ServerName> initialRegionMapping,
461 Map<ServerName, List<HRegionInfo>> clusterState) {
462 float moveCost = 0;
463 for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
464 for (HRegionInfo region : entry.getValue()) {
465 if (initialRegionMapping.get(region) != entry.getKey()) {
466 moveCost += 1;
467 }
468 }
469 }
470
471
472
473 if (moveCost > maxMoves) {
474 return 10000;
475 }
476
477 return scale(0, Math.min(maxMoves, initialRegionMapping.size()), moveCost);
478 }
479
480
481
482
483
484
485
486
487 double computeSkewLoadCost(Map<ServerName, List<HRegionInfo>> clusterState) {
488 DescriptiveStatistics stats = new DescriptiveStatistics();
489 for (List<HRegionInfo> regions : clusterState.values()) {
490 int size = regions.size();
491 stats.addValue(size);
492 }
493 return costFromStats(stats);
494 }
495
496
497
498
499
500
501
502
503 double computeTableSkewLoadCost(Map<ServerName, List<HRegionInfo>> clusterState) {
504
505 Map<String, MutableInt> tableRegionsTotal = new HashMap<String, MutableInt>();
506 Map<String, MutableInt> tableRegionsOnCurrentServer = new HashMap<String, MutableInt>();
507 Map<String, Integer> tableCostSeenSoFar = new HashMap<String, Integer>();
508
509 for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
510 tableRegionsOnCurrentServer.clear();
511
512
513 for (HRegionInfo region : entry.getValue()) {
514 String tableName = region.getTableNameAsString();
515
516
517 MutableInt regionsOnServerCount = tableRegionsOnCurrentServer.get(tableName);
518
519
520
521 if (regionsOnServerCount == null) {
522 regionsOnServerCount = new MutableInt(0);
523 tableRegionsOnCurrentServer.put(tableName, regionsOnServerCount);
524 }
525
526
527
528 regionsOnServerCount.increment();
529
530
531 MutableInt totalCount = tableRegionsTotal.get(tableName);
532
533
534
535 if (totalCount == null) {
536 totalCount = new MutableInt(0);
537 tableRegionsTotal.put(tableName, totalCount);
538 }
539 totalCount.increment();
540 }
541
542
543
544 for (Entry<String, MutableInt> currentServerEntry: tableRegionsOnCurrentServer.entrySet()) {
545 String tableName = currentServerEntry.getKey();
546 Integer thisCount = currentServerEntry.getValue().toInteger();
547 Integer maxCountSoFar = tableCostSeenSoFar.get(tableName);
548
549 if (maxCountSoFar == null || thisCount.compareTo(maxCountSoFar) > 0) {
550 tableCostSeenSoFar.put(tableName, thisCount);
551 }
552 }
553 }
554
555 double max = 0;
556 double min = 0;
557 double value = 0;
558
559
560 for (Entry<String, MutableInt> currentEntry : tableRegionsTotal.entrySet()) {
561 max += tableRegionsTotal.get(currentEntry.getKey()).doubleValue();
562 min += tableRegionsTotal.get(currentEntry.getKey()).doubleValue() / clusterState.size();
563 value += tableCostSeenSoFar.get(currentEntry.getKey()).doubleValue();
564 }
565 return scale(min, max, value);
566 }
567
568
569
570
571
572
573
574
575
576
577 double computeDataLocalityCost(Map<HRegionInfo, ServerName> initialRegionMapping,
578 Map<ServerName, List<HRegionInfo>> clusterState) {
579
580 double max = 0;
581 double cost = 0;
582
583
584 if (this.services == null) return cost;
585
586 for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
587 ServerName sn = entry.getKey();
588 for (HRegionInfo region : entry.getValue()) {
589
590 max += 1;
591
592 List<ServerName> dataOnServers = regionFinder.getTopBlockLocations(region);
593
594
595
596 if (dataOnServers == null) {
597 continue;
598 }
599
600 int index = dataOnServers.indexOf(sn);
601 if (index < 0) {
602 cost += 1;
603 } else {
604 cost += (double) index / (double) dataOnServers.size();
605 }
606
607 }
608 }
609 return scale(0, max, cost);
610 }
611
612
613 private enum RegionLoadCostType {
614 READ_REQUEST, WRITE_REQUEST, MEMSTORE_SIZE, STOREFILE_SIZE
615 }
616
617
618
619
620
621
622
623
624 private double computeRegionLoadCost(Map<ServerName, List<HRegionInfo>> clusterState,
625 RegionLoadCostType costType) {
626
627 if (this.clusterStatus == null || this.loads == null || this.loads.size() == 0) return 0;
628
629 DescriptiveStatistics stats = new DescriptiveStatistics();
630
631
632 for (List<HRegionInfo> regions : clusterState.values()) {
633 long cost = 0;
634
635
636 for (HRegionInfo region : regions) {
637
638 List<RegionLoad> rl = loads.get(region.getRegionNameAsString());
639
640
641 if (rl == null) {
642
643 rl = loads.get(region.getEncodedName());
644 }
645
646 if (rl != null) {
647 cost += getRegionLoadCost(rl, costType);
648 }
649 }
650
651
652 stats.addValue(cost);
653 }
654
655
656 return costFromStats(stats);
657 }
658
659
660
661
662
663
664
665
666 private double getRegionLoadCost(List<RegionLoad> regionLoadList, RegionLoadCostType type) {
667 double cost = 0;
668
669 int size = regionLoadList.size();
670 for(int i =0; i< size; i++) {
671 RegionLoad rl = regionLoadList.get(i);
672 double toAdd = 0;
673 switch (type) {
674 case READ_REQUEST:
675 toAdd = rl.getReadRequestsCount();
676 break;
677 case WRITE_REQUEST:
678 toAdd = rl.getWriteRequestsCount();
679 break;
680 case MEMSTORE_SIZE:
681 toAdd = rl.getMemStoreSizeMB();
682 break;
683 case STOREFILE_SIZE:
684 toAdd = rl.getStorefileSizeMB();
685 break;
686 default:
687 assert false : "RegionLoad cost type not supported.";
688 return 0;
689 }
690
691 if (cost == 0) {
692 cost = toAdd;
693 } else {
694 cost = (.5 * cost) + (.5 * toAdd);
695 }
696 }
697
698 return cost;
699
700 }
701
702
703
704
705
706
707
708
709
710 double costFromStats(DescriptiveStatistics stats) {
711 double totalCost = 0;
712 double mean = stats.getMean();
713
714
715
716 double max = ((stats.getN() - 1) * stats.getMean()) + (stats.getSum() - stats.getMean());
717 for (double n : stats.getValues()) {
718 totalCost += Math.abs(mean - n);
719
720 }
721
722 return scale(0, max, totalCost);
723 }
724
725
726
727
728
729
730
731
732
733 private double scale(double min, double max, double value) {
734 if (max == 0 || value == 0) {
735 return 0;
736 }
737
738 return Math.max(0d, Math.min(1d, (value - min) / max));
739 }
740 }