View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master.balancer;
19  
20  import java.util.ArrayList;
21  import java.util.Arrays;
22  import java.util.Comparator;
23  import java.util.HashMap;
24  import java.util.List;
25  import java.util.Map;
26  import java.util.Map.Entry;
27  import java.util.Random;
28  import java.util.Set;
29  import java.util.TreeMap;
30  import java.util.NavigableMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.conf.Configuration;
35  import org.apache.hadoop.hbase.ClusterStatus;
36  import org.apache.hadoop.hbase.HRegionInfo;
37  import org.apache.hadoop.hbase.RegionLoad;
38  import org.apache.hadoop.hbase.ServerName;
39  import org.apache.hadoop.hbase.master.AssignmentManager;
40  import org.apache.hadoop.hbase.master.LoadBalancer;
41  import org.apache.hadoop.hbase.master.MasterServices;
42  
43  import com.google.common.base.Joiner;
44  import com.google.common.collect.ArrayListMultimap;
45  import com.google.common.collect.Sets;
46  
47  /**
     * The base class for load balancers. It provides the functions used by
     * {@link AssignmentManager} to assign regions in the edge cases. It doesn't
     * provide an implementation of the actual balancing algorithm.
51   *
52   */
53  public abstract class BaseLoadBalancer implements LoadBalancer {
54  
55    /**
56     * An efficient array based implementation similar to ClusterState for keeping
57     * the status of the cluster in terms of region assignment and distribution.
58     * To be used by LoadBalancers.
59     */
60    protected static class Cluster {
        // serverIndex -> server name; when several names share a host:port the
        // constructor keeps the one with the highest startcode
        ServerName[] servers;
        // tableIndex -> table name, in order of first appearance
        ArrayList<String> tables;
        // regionIndex -> region descriptor
        HRegionInfo[] regions;
        // regionIndex -> load history looked up by region name (or encoded name);
        // entries stay null when no loads map was supplied or nothing matched
        List<RegionLoad>[] regionLoads;
        int[][] regionLocations; //regionIndex -> list of serverIndex sorted by locality

        int[][] regionsPerServer;            //serverIndex -> region list
        int[]   regionIndexToServerIndex;    //regionIndex -> serverIndex
        int[]   initialRegionIndexToServerIndex;    //regionIndex -> serverIndex (initial cluster state)
        int[]   regionIndexToTableIndex;     //regionIndex -> tableIndex
        int[][] numRegionsPerServerPerTable; //serverIndex -> tableIndex -> # regions
        int[]   numMaxRegionsPerTable;       //tableIndex -> max number of regions in a single RS

        // server indices; reordered by hosted region count in sortServersByRegionCount()
        Integer[] serverIndicesSortedByRegionCount;

        // host:port -> serverIndex
        Map<String, Integer> serversToIndex;
        // table name -> tableIndex
        Map<String, Integer> tablesToIndex;

        int numRegions;
        int numServers;
        int numTables;

        int numMovedRegions = 0; //num moved regions from the initial configuration
        int numMovedMetaRegions = 0;       //num of moved regions that are META
85  
86      protected Cluster(Map<ServerName, List<HRegionInfo>> clusterState,  Map<String, List<RegionLoad>> loads,
87          RegionLocationFinder regionFinder) {
88  
89        serversToIndex = new HashMap<String, Integer>();
90        tablesToIndex = new HashMap<String, Integer>();
91        //regionsToIndex = new HashMap<HRegionInfo, Integer>();
92  
93        //TODO: We should get the list of tables from master
94        tables = new ArrayList<String>();
95  
96  
97        numRegions = 0;
98  
99        int serverIndex = 0;
100 
101       // Use servername and port as there can be dead servers in this list. We want everything with
102       // a matching hostname and port to have the same index.
103       for (ServerName sn:clusterState.keySet()) {
104         if (serversToIndex.get(sn.getHostAndPort()) == null) {
105           serversToIndex.put(sn.getHostAndPort(), serverIndex++);
106         }
107       }
108 
109       // Count how many regions there are.
110       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
111         numRegions += entry.getValue().size();
112       }
113 
114       numServers = serversToIndex.size();
115       regionsPerServer = new int[serversToIndex.size()][];
116 
117       servers = new ServerName[numServers];
118       regions = new HRegionInfo[numRegions];
119       regionIndexToServerIndex = new int[numRegions];
120       initialRegionIndexToServerIndex = new int[numRegions];
121       regionIndexToTableIndex = new int[numRegions];
122       regionLoads = new List[numRegions];
123       regionLocations = new int[numRegions][];
124       serverIndicesSortedByRegionCount = new Integer[numServers];
125 
126       int tableIndex = 0, regionIndex = 0, regionPerServerIndex = 0;
127 
128       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
129         serverIndex = serversToIndex.get(entry.getKey().getHostAndPort());
130 
131         // keep the servername if this is the first server name for this hostname
132         // or this servername has the newest startcode.
133         if (servers[serverIndex] == null ||
134             servers[serverIndex].getStartcode() < entry.getKey().getStartcode()) {
135           servers[serverIndex] = entry.getKey();
136         }
137 
138         regionsPerServer[serverIndex] = new int[entry.getValue().size()];
139         serverIndicesSortedByRegionCount[serverIndex] = serverIndex;
140       }
141 
142       for (Entry<ServerName, List<HRegionInfo>> entry : clusterState.entrySet()) {
143         serverIndex = serversToIndex.get(entry.getKey().getHostAndPort());
144         regionPerServerIndex = 0;
145 
146         for (HRegionInfo region : entry.getValue()) {
147           String tableName = region.getTableName().getNameAsString();
148           Integer idx = tablesToIndex.get(tableName);
149           if (idx == null) {
150             tables.add(tableName);
151             idx = tableIndex;
152             tablesToIndex.put(tableName, tableIndex++);
153           }
154 
155           regions[regionIndex] = region;
156           regionIndexToServerIndex[regionIndex] = serverIndex;
157           initialRegionIndexToServerIndex[regionIndex] = serverIndex;
158           regionIndexToTableIndex[regionIndex] = idx;
159           regionsPerServer[serverIndex][regionPerServerIndex++] = regionIndex;
160 
161           // region load
162           if (loads != null) {
163             List<RegionLoad> rl = loads.get(region.getRegionNameAsString());
164             // That could have failed if the RegionLoad is using the other regionName
165             if (rl == null) {
166               // Try getting the region load using encoded name.
167               rl = loads.get(region.getEncodedName());
168             }
169             regionLoads[regionIndex] = rl;
170           }
171 
172           if (regionFinder != null) {
173             //region location
174             List<ServerName> loc = regionFinder.getTopBlockLocations(region);
175             regionLocations[regionIndex] = new int[loc.size()];
176             for (int i=0; i < loc.size(); i++) {
177               regionLocations[regionIndex][i] =
178                   loc.get(i) == null ? -1 :
179                     (serversToIndex.get(loc.get(i)) == null ? -1 : serversToIndex.get(loc.get(i)));
180             }
181           }
182 
183           regionIndex++;
184         }
185       }
186 
187       numTables = tables.size();
188       numRegionsPerServerPerTable = new int[numServers][numTables];
189 
190       for (int i = 0; i < numServers; i++) {
191         for (int j = 0; j < numTables; j++) {
192           numRegionsPerServerPerTable[i][j] = 0;
193         }
194       }
195 
196       for (int i=0; i < regionIndexToServerIndex.length; i++) {
197         numRegionsPerServerPerTable[regionIndexToServerIndex[i]][regionIndexToTableIndex[i]]++;
198       }
199 
200       numMaxRegionsPerTable = new int[numTables];
201       for (serverIndex = 0 ; serverIndex < numRegionsPerServerPerTable.length; serverIndex++) {
202         for (tableIndex = 0 ; tableIndex < numRegionsPerServerPerTable[serverIndex].length; tableIndex++) {
203           if (numRegionsPerServerPerTable[serverIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
204             numMaxRegionsPerTable[tableIndex] = numRegionsPerServerPerTable[serverIndex][tableIndex];
205           }
206         }
207       }
208     }
209 
210     public void moveOrSwapRegion(int lServer, int rServer, int lRegion, int rRegion) {
211       //swap
212       if (rRegion >= 0 && lRegion >= 0) {
213         regionMoved(rRegion, rServer, lServer);
214         regionsPerServer[rServer] = replaceRegion(regionsPerServer[rServer], rRegion, lRegion);
215         regionMoved(lRegion, lServer, rServer);
216         regionsPerServer[lServer] = replaceRegion(regionsPerServer[lServer], lRegion, rRegion);
217       } else if (rRegion >= 0) { //move rRegion
218         regionMoved(rRegion, rServer, lServer);
219         regionsPerServer[rServer] = removeRegion(regionsPerServer[rServer], rRegion);
220         regionsPerServer[lServer] = addRegion(regionsPerServer[lServer], rRegion);
221       } else if (lRegion >= 0) { //move lRegion
222         regionMoved(lRegion, lServer, rServer);
223         regionsPerServer[lServer] = removeRegion(regionsPerServer[lServer], lRegion);
224         regionsPerServer[rServer] = addRegion(regionsPerServer[rServer], lRegion);
225       }
226     }
227 
228     /** Region moved out of the server */
229     void regionMoved(int regionIndex, int oldServerIndex, int newServerIndex) {
230       regionIndexToServerIndex[regionIndex] = newServerIndex;
231       if (initialRegionIndexToServerIndex[regionIndex] == newServerIndex) {
232         numMovedRegions--; //region moved back to original location
233         if (regions[regionIndex].isMetaRegion()) {
234           numMovedMetaRegions--;
235         }
236       } else if (initialRegionIndexToServerIndex[regionIndex] == oldServerIndex) {
237         numMovedRegions++; //region moved from original location
238         if (regions[regionIndex].isMetaRegion()) {
239           numMovedMetaRegions++;
240         }
241       }
242       int tableIndex = regionIndexToTableIndex[regionIndex];
243       numRegionsPerServerPerTable[oldServerIndex][tableIndex]--;
244       numRegionsPerServerPerTable[newServerIndex][tableIndex]++;
245 
246       //check whether this caused maxRegionsPerTable in the new Server to be updated
247       if (numRegionsPerServerPerTable[newServerIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
248         numRegionsPerServerPerTable[newServerIndex][tableIndex] = numMaxRegionsPerTable[tableIndex];
249       } else if ((numRegionsPerServerPerTable[oldServerIndex][tableIndex] + 1)
250           == numMaxRegionsPerTable[tableIndex]) {
251         //recompute maxRegionsPerTable since the previous value was coming from the old server
252         for (int serverIndex = 0 ; serverIndex < numRegionsPerServerPerTable.length; serverIndex++) {
253           if (numRegionsPerServerPerTable[serverIndex][tableIndex] > numMaxRegionsPerTable[tableIndex]) {
254             numMaxRegionsPerTable[tableIndex] = numRegionsPerServerPerTable[serverIndex][tableIndex];
255           }
256         }
257       }
258     }
259 
260     int[] removeRegion(int[] regions, int regionIndex) {
261       //TODO: this maybe costly. Consider using linked lists
262       int[] newRegions = new int[regions.length - 1];
263       int i = 0;
264       for (i = 0; i < regions.length; i++) {
265         if (regions[i] == regionIndex) {
266           break;
267         }
268         newRegions[i] = regions[i];
269       }
270       System.arraycopy(regions, i+1, newRegions, i, newRegions.length - i);
271       return newRegions;
272     }
273 
274     int[] addRegion(int[] regions, int regionIndex) {
275       int[] newRegions = new int[regions.length + 1];
276       System.arraycopy(regions, 0, newRegions, 0, regions.length);
277       newRegions[newRegions.length - 1] = regionIndex;
278       return newRegions;
279     }
280 
281     int[] replaceRegion(int[] regions, int regionIndex, int newRegionIndex) {
282       int i = 0;
283       for (i = 0; i < regions.length; i++) {
284         if (regions[i] == regionIndex) {
285           regions[i] = newRegionIndex;
286           break;
287         }
288       }
289       return regions;
290     }
291 
292     void sortServersByRegionCount() {
293       Arrays.sort(serverIndicesSortedByRegionCount, numRegionsComparator);
294     }
295 
296     int getNumRegions(int server) {
297       return regionsPerServer[server].length;
298     }
299 
300     private Comparator<Integer> numRegionsComparator = new Comparator<Integer>() {
301       @Override
302       public int compare(Integer integer, Integer integer2) {
303         return Integer.valueOf(getNumRegions(integer)).compareTo(getNumRegions(integer2));
304       }
305     };
306 
307     @Override
308     public String toString() {
309       String desc = "Cluster{" +
310           "servers=[";
311           for(ServerName sn:servers) {
312              desc += sn.getHostAndPort() + ", ";
313           }
314           desc +=
315           ", serverIndicesSortedByRegionCount="+
316           Arrays.toString(serverIndicesSortedByRegionCount) +
317           ", regionsPerServer=[";
318 
319           for (int[]r:regionsPerServer) {
320             desc += Arrays.toString(r);
321           }
322           desc += "]" +
323           ", numMaxRegionsPerTable=" +
324           Arrays.toString(numMaxRegionsPerTable) +
325           ", numRegions=" +
326           numRegions +
327           ", numServers=" +
328           numServers +
329           ", numTables=" +
330           numTables +
331           ", numMovedRegions=" +
332           numMovedRegions +
333           ", numMovedMetaRegions=" +
334           numMovedMetaRegions +
335           '}';
336       return desc;
337     }
338   }
339 
      // slop for regions: tolerated deviation from the average load, read from
      // "hbase.regions.slop" and clamped into [0, 1] by setConf
      private float slop;
      // configuration most recently supplied through setConf
      private Configuration config;
      // shared random source, seeded from the wall clock when the class is initialized
      private static final Random RANDOM = new Random(System.currentTimeMillis());
      private static final Log LOG = LogFactory.getLog(BaseLoadBalancer.class);
      // master services handle supplied through setMasterServices
      protected MasterServices services;
346 
347   @Override
348   public void setConf(Configuration conf) {
349     this.slop = conf.getFloat("hbase.regions.slop", (float) 0.2);
350     if (slop < 0) slop = 0;
351     else if (slop > 1) slop = 1;
352     this.config = conf;
353   }
354 
355   @Override
356   public Configuration getConf() {
357     return this.config;
358   }
359 
      /**
       * Intentionally a no-op in the base class; the status is ignored here.
       *
       * @param st the current cluster status
       */
      public void setClusterStatus(ClusterStatus st) {
        // Not used except for the StochasticBalancer
      }
363 
364   public void setMasterServices(MasterServices masterServices) {
365     this.services = masterServices;
366   }
367 
368   protected boolean needsBalance(ClusterLoadState cs) {
369     if (cs.getNumServers() == 0) {
370       LOG.debug("numServers=0 so skipping load balancing");
371       return false;
372     }
373     // Check if we even need to do any load balancing
374     // HBASE-3681 check sloppiness first
375     float average = cs.getLoadAverage(); // for logging
376     int floor = (int) Math.floor(average * (1 - slop));
377     int ceiling = (int) Math.ceil(average * (1 + slop));
378     if (!(cs.getMinLoad() > ceiling || cs.getMaxLoad() < floor)) {
379       NavigableMap<ServerAndLoad, List<HRegionInfo>> serversByLoad = cs.getServersByLoad();
380       if (LOG.isTraceEnabled()) {
381         // If nothing to balance, then don't say anything unless trace-level logging.
382         LOG.trace("Skipping load balancing because balanced cluster; " +
383           "servers=" + cs.getNumServers() + " " +
384           "regions=" + cs.getNumRegions() + " average=" + average + " " +
385           "mostloaded=" + serversByLoad.lastKey().getLoad() +
386           " leastloaded=" + serversByLoad.firstKey().getLoad());
387       }
388       return false;
389     }
390     return true;
391   }
392 
393   /**
394    * Generates a bulk assignment plan to be used on cluster startup using a
395    * simple round-robin assignment.
396    * <p>
397    * Takes a list of all the regions and all the servers in the cluster and
398    * returns a map of each server to the regions that it should be assigned.
399    * <p>
400    * Currently implemented as a round-robin assignment. Same invariant as load
401    * balancing, all servers holding floor(avg) or ceiling(avg).
402    *
403    * TODO: Use block locations from HDFS to place regions with their blocks
404    *
405    * @param regions all regions
406    * @param servers all servers
407    * @return map of server to the regions it should take, or null if no
408    *         assignment is possible (ie. no regions or no servers)
409    */
410   public Map<ServerName, List<HRegionInfo>> roundRobinAssignment(List<HRegionInfo> regions,
411       List<ServerName> servers) {
412     if (regions.isEmpty() || servers.isEmpty()) {
413       return null;
414     }
415     Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<ServerName, List<HRegionInfo>>();
416     int numRegions = regions.size();
417     int numServers = servers.size();
418     int max = (int) Math.ceil((float) numRegions / numServers);
419     int serverIdx = 0;
420     if (numServers > 1) {
421       serverIdx = RANDOM.nextInt(numServers);
422     }
423     int regionIdx = 0;
424     for (int j = 0; j < numServers; j++) {
425       ServerName server = servers.get((j + serverIdx) % numServers);
426       List<HRegionInfo> serverRegions = new ArrayList<HRegionInfo>(max);
427       for (int i = regionIdx; i < numRegions; i += numServers) {
428         serverRegions.add(regions.get(i % numRegions));
429       }
430       assignments.put(server, serverRegions);
431       regionIdx++;
432     }
433     return assignments;
434   }
435 
436   /**
437    * Generates an immediate assignment plan to be used by a new master for
438    * regions in transition that do not have an already known destination.
439    *
440    * Takes a list of regions that need immediate assignment and a list of all
441    * available servers. Returns a map of regions to the server they should be
442    * assigned to.
443    *
444    * This method will return quickly and does not do any intelligent balancing.
445    * The goal is to make a fast decision not the best decision possible.
446    *
447    * Currently this is random.
448    *
449    * @param regions
450    * @param servers
451    * @return map of regions to the server it should be assigned to
452    */
453   public Map<HRegionInfo, ServerName> immediateAssignment(List<HRegionInfo> regions,
454       List<ServerName> servers) {
455     Map<HRegionInfo, ServerName> assignments = new TreeMap<HRegionInfo, ServerName>();
456     for (HRegionInfo region : regions) {
457       assignments.put(region, randomAssignment(region, servers));
458     }
459     return assignments;
460   }
461 
462   /**
463    * Used to assign a single region to a random server.
464    */
465   public ServerName randomAssignment(HRegionInfo regionInfo, List<ServerName> servers) {
466     if (servers == null || servers.isEmpty()) {
467       LOG.warn("Wanted to do random assignment but no servers to assign to");
468       return null;
469     }
470     return servers.get(RANDOM.nextInt(servers.size()));
471   }
472 
473   /**
474    * Generates a bulk assignment startup plan, attempting to reuse the existing
475    * assignment information from META, but adjusting for the specified list of
476    * available/online servers available for assignment.
477    * <p>
478    * Takes a map of all regions to their existing assignment from META. Also
479    * takes a list of online servers for regions to be assigned to. Attempts to
480    * retain all assignment, so in some instances initial assignment will not be
481    * completely balanced.
482    * <p>
483    * Any leftover regions without an existing server to be assigned to will be
484    * assigned randomly to available servers.
485    *
486    * @param regions regions and existing assignment from meta
487    * @param servers available servers
488    * @return map of servers and regions to be assigned to them
489    */
490   public Map<ServerName, List<HRegionInfo>> retainAssignment(Map<HRegionInfo, ServerName> regions,
491       List<ServerName> servers) {
492     // Group all of the old assignments by their hostname.
493     // We can't group directly by ServerName since the servers all have
494     // new start-codes.
495 
496     // Group the servers by their hostname. It's possible we have multiple
497     // servers on the same host on different ports.
498     ArrayListMultimap<String, ServerName> serversByHostname = ArrayListMultimap.create();
499     for (ServerName server : servers) {
500       serversByHostname.put(server.getHostname(), server);
501     }
502 
503     // Now come up with new assignments
504     Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<ServerName, List<HRegionInfo>>();
505 
506     for (ServerName server : servers) {
507       assignments.put(server, new ArrayList<HRegionInfo>());
508     }
509 
510     // Collection of the hostnames that used to have regions
511     // assigned, but for which we no longer have any RS running
512     // after the cluster restart.
513     Set<String> oldHostsNoLongerPresent = Sets.newTreeSet();
514 
515     int numRandomAssignments = 0;
516     int numRetainedAssigments = 0;
517     for (Map.Entry<HRegionInfo, ServerName> entry : regions.entrySet()) {
518       HRegionInfo region = entry.getKey();
519       ServerName oldServerName = entry.getValue();
520       List<ServerName> localServers = new ArrayList<ServerName>();
521       if (oldServerName != null) {
522         localServers = serversByHostname.get(oldServerName.getHostname());
523       }
524       if (localServers.isEmpty()) {
525         // No servers on the new cluster match up with this hostname,
526         // assign randomly.
527         ServerName randomServer = servers.get(RANDOM.nextInt(servers.size()));
528         assignments.get(randomServer).add(region);
529         numRandomAssignments++;
530         if (oldServerName != null) oldHostsNoLongerPresent.add(oldServerName.getHostname());
531       } else if (localServers.size() == 1) {
532         // the usual case - one new server on same host
533         assignments.get(localServers.get(0)).add(region);
534         numRetainedAssigments++;
535       } else {
536         // multiple new servers in the cluster on this same host
537         int size = localServers.size();
538         ServerName target = localServers.get(RANDOM.nextInt(size));
539         assignments.get(target).add(region);
540         numRetainedAssigments++;
541       }
542     }
543 
544     String randomAssignMsg = "";
545     if (numRandomAssignments > 0) {
546       randomAssignMsg =
547           numRandomAssignments + " regions were assigned "
548               + "to random hosts, since the old hosts for these regions are no "
549               + "longer present in the cluster. These hosts were:\n  "
550               + Joiner.on("\n  ").join(oldHostsNoLongerPresent);
551     }
552 
553     LOG.info("Reassigned " + regions.size() + " regions. " + numRetainedAssigments
554         + " retained the pre-restart assignment. " + randomAssignMsg);
555     return assignments;
556   }
557 
558 }