View Javadoc

1   /**
2    *
3    * Licensed to the Apache Software Foundation (ASF) under one
4    * or more contributor license agreements.  See the NOTICE file
5    * distributed with this work for additional information
6    * regarding copyright ownership.  The ASF licenses this file
7    * to you under the Apache License, Version 2.0 (the
8    * "License"); you may not use this file except in compliance
9    * with the License.  You may obtain a copy of the License at
10   *
11   *     http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   */
19  
20  package org.apache.hadoop.hbase.master.balancer;
21  
22  import java.io.IOException;
23  import java.util.ArrayList;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Random;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.TableName;
37  import org.apache.hadoop.hbase.HConstants;
38  import org.apache.hadoop.hbase.HRegionInfo;
39  import org.apache.hadoop.hbase.ServerName;
40  import org.apache.hadoop.hbase.catalog.CatalogTracker;
41  import org.apache.hadoop.hbase.catalog.MetaEditor;
42  import org.apache.hadoop.hbase.catalog.MetaReader;
43  import org.apache.hadoop.hbase.catalog.MetaReader.Visitor;
44  import org.apache.hadoop.hbase.client.Put;
45  import org.apache.hadoop.hbase.client.Result;
46  import org.apache.hadoop.hbase.master.RackManager;
47  import org.apache.hadoop.hbase.protobuf.ProtobufUtil;
48  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
49  import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos.FavoredNodes;
50  import org.apache.hadoop.hbase.util.Bytes;
51  import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
52  import org.apache.hadoop.hbase.util.Pair;
53  
54  import com.google.protobuf.InvalidProtocolBufferException;
55  
56  /**
57   * Helper class for {@link FavoredNodeLoadBalancer} that has all the intelligence
58   * for racks, meta scans, etc. Instantiated by the {@link FavoredNodeLoadBalancer}
59   * when needed (from within calls like
60   * {@link FavoredNodeLoadBalancer#randomAssignment(HRegionInfo, List)}).
61   *
62   */
63  @InterfaceAudience.Private
64  public class FavoredNodeAssignmentHelper {
65    private static final Log LOG = LogFactory.getLog(FavoredNodeAssignmentHelper.class);
66    private RackManager rackManager;
67    private Map<String, List<ServerName>> rackToRegionServerMap;
68    private List<String> uniqueRackList;
69    private Map<ServerName, String> regionServerToRackMap;
70    private Random random;
71    private List<ServerName> servers;
72    public static final byte [] FAVOREDNODES_QUALIFIER = Bytes.toBytes("fn");
73    public final static short FAVORED_NODES_NUM = 3;
74  
75    public FavoredNodeAssignmentHelper(final List<ServerName> servers, Configuration conf) {
76      this(servers, new RackManager(conf));
77    }
78  
79    public FavoredNodeAssignmentHelper(final List<ServerName> servers,
80        final RackManager rackManager) {
81      this.servers = servers;
82      this.rackManager = rackManager;
83      this.rackToRegionServerMap = new HashMap<String, List<ServerName>>();
84      this.regionServerToRackMap = new HashMap<ServerName, String>();
85      this.uniqueRackList = new ArrayList<String>();
86      this.random = new Random();
87    }
88  
89    /**
90     * Perform full scan of the meta table similar to
91     * {@link MetaReader#fullScan(CatalogTracker, Set, boolean)} except that this is
92     * aware of the favored nodes
93     * @param catalogTracker
94     * @param disabledTables
95     * @param excludeOfflinedSplitParents
96     * @param balancer required because we need to let the balancer know about the
97     * current favored nodes from meta scan
98     * @return Returns a map of every region to it's currently assigned server,
99     * according to META.  If the region does not have an assignment it will have
100    * a null value in the map.
101    * @throws IOException
102    */
103   public static Map<HRegionInfo, ServerName> fullScan(
104       CatalogTracker catalogTracker, final Set<TableName> disabledTables,
105       final boolean excludeOfflinedSplitParents,
106       FavoredNodeLoadBalancer balancer) throws IOException {
107     final Map<HRegionInfo, ServerName> regions =
108         new TreeMap<HRegionInfo, ServerName>();
109     final Map<HRegionInfo, ServerName[]> favoredNodesMap =
110         new HashMap<HRegionInfo, ServerName[]>();
111     Visitor v = new Visitor() {
112       @Override
113       public boolean visit(Result r) throws IOException {
114         if (r ==  null || r.isEmpty()) return true;
115         Pair<HRegionInfo, ServerName> region = HRegionInfo.getHRegionInfoAndServerName(r);
116         HRegionInfo hri = region.getFirst();
117         if (hri  == null) return true;
118         if (hri.getTableName() == null) return true;
119         if (disabledTables.contains(
120             hri.getTableName())) return true;
121         // Are we to include split parents in the list?
122         if (excludeOfflinedSplitParents && hri.isSplitParent()) return true;
123         regions.put(hri, region.getSecond());
124         byte[] favoredNodes = r.getValue(HConstants.CATALOG_FAMILY,
125                FavoredNodeAssignmentHelper.FAVOREDNODES_QUALIFIER);
126         if (favoredNodes != null) {
127           ServerName[] favoredServerList =
128             FavoredNodeAssignmentHelper.getFavoredNodesList(favoredNodes);
129           favoredNodesMap.put(hri, favoredServerList);
130         }
131         return true;
132       }
133     };
134     MetaReader.fullScan(catalogTracker, v);
135     balancer.noteFavoredNodes(favoredNodesMap);
136     return regions;
137   }
138 
139   public static void updateMetaWithFavoredNodesInfo(
140       Map<HRegionInfo, List<ServerName>> regionToFavoredNodes,
141       CatalogTracker catalogTracker) throws IOException {
142     List<Put> puts = new ArrayList<Put>();
143     for (Map.Entry<HRegionInfo, List<ServerName>> entry : regionToFavoredNodes.entrySet()) {
144       Put put = makePutFromRegionInfo(entry.getKey(), entry.getValue());
145       if (put != null) {
146         puts.add(put);
147       }
148     }
149     MetaEditor.putsToMetaTable(catalogTracker, puts);
150     LOG.info("Added " + puts.size() + " regions in META");
151   }
152 
153   /**
154    * Generates and returns a Put containing the region info for the catalog table
155    * and the servers
156    * @param regionInfo
157    * @param favoredNodeList
158    * @return Put object
159    */
160   static Put makePutFromRegionInfo(HRegionInfo regionInfo, List<ServerName>favoredNodeList)
161   throws IOException {
162     Put put = null;
163     if (favoredNodeList != null) {
164       put = MetaEditor.makePutFromRegionInfo(regionInfo);
165       byte[] favoredNodes = getFavoredNodes(favoredNodeList);
166       put.add(HConstants.CATALOG_FAMILY, FAVOREDNODES_QUALIFIER,
167           EnvironmentEdgeManager.currentTimeMillis(), favoredNodes);
168       LOG.info("Create the region " + regionInfo.getRegionNameAsString() +
169           " with favored nodes " + favoredNodes);
170     }
171     return put;
172   }
173 
174   /**
175    * @param favoredNodes The PB'ed bytes of favored nodes
176    * @return the array of {@link ServerName} for the byte array of favored nodes.
177    * @throws InvalidProtocolBufferException
178    */
179   public static ServerName[] getFavoredNodesList(byte[] favoredNodes)
180       throws InvalidProtocolBufferException {
181     FavoredNodes f = FavoredNodes.parseFrom(favoredNodes);
182     List<HBaseProtos.ServerName> protoNodes = f.getFavoredNodeList();
183     ServerName[] servers = new ServerName[protoNodes.size()];
184     int i = 0;
185     for (HBaseProtos.ServerName node : protoNodes) {
186       servers[i++] = ProtobufUtil.toServerName(node);
187     }
188     return servers;
189   }
190 
191   /**
192    * @param serverList
193    * @return PB'ed bytes of {@link FavoredNodes} generated by the server list.
194    */
195   static byte[] getFavoredNodes(List<ServerName> serverAddrList) {
196     FavoredNodes.Builder f = FavoredNodes.newBuilder();
197     for (ServerName s : serverAddrList) {
198       HBaseProtos.ServerName.Builder b = HBaseProtos.ServerName.newBuilder();
199       b.setHostName(s.getHostname());
200       b.setPort(s.getPort());
201       b.setStartCode(s.getStartcode());
202       f.addFavoredNode(b.build());
203     }
204     return f.build().toByteArray();
205   }
206 
207   // Place the regions round-robin across the racks picking one server from each
208   // rack at a time. Start with a random rack, and a random server from every rack.
209   // If a rack doesn't have enough servers it will go to the next rack and so on.
210   // for choosing a primary.
211   // For example, if 4 racks (r1 .. r4) with 8 servers (s1..s8) each, one possible
212   // placement could be r2:s5, r3:s5, r4:s5, r1:s5, r2:s6, r3:s6..
213   // If there were fewer servers in one rack, say r3, which had 3 servers, one possible
214   // placement could be r2:s5, <skip-r3>, r4:s5, r1:s5, r2:s6, <skip-r3> ...
215   // The regions should be distributed proportionately to the racksizes
216   void placePrimaryRSAsRoundRobin(Map<ServerName, List<HRegionInfo>> assignmentMap,
217       Map<HRegionInfo, ServerName> primaryRSMap, List<HRegionInfo> regions) {
218     List<String> rackList = new ArrayList<String>(rackToRegionServerMap.size());
219     rackList.addAll(rackToRegionServerMap.keySet());
220     int rackIndex = random.nextInt(rackList.size());
221     int maxRackSize = 0;
222     for (Map.Entry<String,List<ServerName>> r : rackToRegionServerMap.entrySet()) {
223       if (r.getValue().size() > maxRackSize) {
224         maxRackSize = r.getValue().size();
225       }
226     }
227     int numIterations = 0;
228     int firstServerIndex = random.nextInt(maxRackSize);
229     // Initialize the current processing host index.
230     int serverIndex = firstServerIndex;
231     for (HRegionInfo regionInfo : regions) {
232       List<ServerName> currentServerList;
233       String rackName;
234       while (true) {
235         rackName = rackList.get(rackIndex);
236         numIterations++;
237         // Get the server list for the current rack
238         currentServerList = rackToRegionServerMap.get(rackName);
239         
240         if (serverIndex >= currentServerList.size()) { //not enough machines in this rack
241           if (numIterations % rackList.size() == 0) {
242             if (++serverIndex >= maxRackSize) serverIndex = 0;
243           }
244           if ((++rackIndex) >= rackList.size()) {
245             rackIndex = 0; // reset the rack index to 0
246           }
247         } else break;
248       }
249 
250       // Get the current process region server
251       ServerName currentServer = currentServerList.get(serverIndex);
252 
253       // Place the current region with the current primary region server
254       primaryRSMap.put(regionInfo, currentServer);
255       List<HRegionInfo> regionsForServer = assignmentMap.get(currentServer);
256       if (regionsForServer == null) {
257         regionsForServer = new ArrayList<HRegionInfo>();
258         assignmentMap.put(currentServer, regionsForServer);
259       }
260       regionsForServer.add(regionInfo);
261 
262       // Set the next processing index
263       if (numIterations % rackList.size() == 0) {
264         ++serverIndex;
265       }
266       if ((++rackIndex) >= rackList.size()) {
267         rackIndex = 0; // reset the rack index to 0
268       }
269     }
270   }
271 
272   Map<HRegionInfo, ServerName[]> placeSecondaryAndTertiaryRS(
273       Map<HRegionInfo, ServerName> primaryRSMap) {
274     Map<HRegionInfo, ServerName[]> secondaryAndTertiaryMap =
275         new HashMap<HRegionInfo, ServerName[]>();
276     for (Map.Entry<HRegionInfo, ServerName> entry : primaryRSMap.entrySet()) {
277       // Get the target region and its primary region server rack
278       HRegionInfo regionInfo = entry.getKey();
279       ServerName primaryRS = entry.getValue();
280       try {
281         // Create the secondary and tertiary region server pair object.
282         ServerName[] favoredNodes;
283         // Get the rack for the primary region server
284         String primaryRack = rackManager.getRack(primaryRS);
285 
286         if (getTotalNumberOfRacks() == 1) {
287           favoredNodes = singleRackCase(regionInfo, primaryRS, primaryRack);
288         } else {
289           favoredNodes = multiRackCase(regionInfo, primaryRS, primaryRack);
290         }
291         if (favoredNodes != null) {
292           secondaryAndTertiaryMap.put(regionInfo, favoredNodes);
293           LOG.debug("Place the secondary and tertiary region server for region "
294               + regionInfo.getRegionNameAsString());
295         }
296       } catch (Exception e) {
297         LOG.warn("Cannot place the favored nodes for region " +
298             regionInfo.getRegionNameAsString() + " because " + e);
299         continue;
300       }
301     }
302     return secondaryAndTertiaryMap;
303   }
304 
305   private ServerName[] singleRackCase(HRegionInfo regionInfo,
306       ServerName primaryRS,
307       String primaryRack) throws IOException {
308     // Single rack case: have to pick the secondary and tertiary
309     // from the same rack
310     List<ServerName> serverList = getServersFromRack(primaryRack);
311     if (serverList.size() <= 2) {
312       // Single region server case: cannot not place the favored nodes
313       // on any server; !domain.canPlaceFavoredNodes()
314       return null;
315     } else {
316       // Randomly select two region servers from the server list and make sure
317       // they are not overlap with the primary region server;
318      Set<ServerName> serverSkipSet = new HashSet<ServerName>();
319      serverSkipSet.add(primaryRS);
320 
321      // Place the secondary RS
322      ServerName secondaryRS = getOneRandomServer(primaryRack, serverSkipSet);
323      // Skip the secondary for the tertiary placement
324      serverSkipSet.add(secondaryRS);
325 
326      // Place the tertiary RS
327      ServerName tertiaryRS =
328        getOneRandomServer(primaryRack, serverSkipSet);
329 
330      if (secondaryRS == null || tertiaryRS == null) {
331        LOG.error("Cannot place the secondary and terinary" +
332            "region server for region " +
333            regionInfo.getRegionNameAsString());
334      }
335      // Create the secondary and tertiary pair
336      ServerName[] favoredNodes = new ServerName[2];
337      favoredNodes[0] = secondaryRS;
338      favoredNodes[1] = tertiaryRS;
339      return favoredNodes;
340     }
341   }
342 
343   private ServerName[] multiRackCase(HRegionInfo regionInfo,
344       ServerName primaryRS,
345       String primaryRack) throws IOException {
346 
347     // Random to choose the secondary and tertiary region server
348     // from another rack to place the secondary and tertiary
349 
350     // Random to choose one rack except for the current rack
351     Set<String> rackSkipSet = new HashSet<String>();
352     rackSkipSet.add(primaryRack);
353     ServerName[] favoredNodes = new ServerName[2];
354     String secondaryRack = getOneRandomRack(rackSkipSet);
355     List<ServerName> serverList = getServersFromRack(secondaryRack);
356     if (serverList.size() >= 2) {
357       // Randomly pick up two servers from this secondary rack
358 
359       // Place the secondary RS
360       ServerName secondaryRS = getOneRandomServer(secondaryRack);
361 
362       // Skip the secondary for the tertiary placement
363       Set<ServerName> skipServerSet = new HashSet<ServerName>();
364       skipServerSet.add(secondaryRS);
365       // Place the tertiary RS
366       ServerName tertiaryRS = getOneRandomServer(secondaryRack, skipServerSet);
367 
368       if (secondaryRS == null || tertiaryRS == null) {
369         LOG.error("Cannot place the secondary and terinary" +
370             "region server for region " +
371             regionInfo.getRegionNameAsString());
372       }
373       // Create the secondary and tertiary pair
374       favoredNodes[0] = secondaryRS;
375       favoredNodes[1] = tertiaryRS;
376     } else {
377       // Pick the secondary rs from this secondary rack
378       // and pick the tertiary from another random rack
379       favoredNodes[0] = getOneRandomServer(secondaryRack);
380 
381       // Pick the tertiary
382       if (getTotalNumberOfRacks() == 2) {
383         // Pick the tertiary from the same rack of the primary RS
384         Set<ServerName> serverSkipSet = new HashSet<ServerName>();
385         serverSkipSet.add(primaryRS);
386         favoredNodes[1] = getOneRandomServer(primaryRack, serverSkipSet);
387       } else {
388         // Pick the tertiary from another rack
389         rackSkipSet.add(secondaryRack);
390         String tertiaryRandomRack = getOneRandomRack(rackSkipSet);
391         favoredNodes[1] = getOneRandomServer(tertiaryRandomRack);
392       }
393     }
394     return favoredNodes;
395   }
396 
397   boolean canPlaceFavoredNodes() {
398     int serverSize = this.regionServerToRackMap.size();
399     return (serverSize >= FAVORED_NODES_NUM);
400   }
401 
402   void initialize() {
403     for (ServerName sn : this.servers) {
404       String rackName = this.rackManager.getRack(sn);
405       List<ServerName> serverList = this.rackToRegionServerMap.get(rackName);
406       if (serverList == null) {
407         serverList = new ArrayList<ServerName>();
408         // Add the current rack to the unique rack list
409         this.uniqueRackList.add(rackName);
410       }
411       if (!serverList.contains(sn)) {
412         serverList.add(sn);
413         this.rackToRegionServerMap.put(rackName, serverList);
414         this.regionServerToRackMap.put(sn, rackName);
415       }
416     }
417   }
418 
419   private int getTotalNumberOfRacks() {
420     return this.uniqueRackList.size();
421   }
422 
423   private List<ServerName> getServersFromRack(String rack) {
424     return this.rackToRegionServerMap.get(rack);
425   }
426 
427   private ServerName getOneRandomServer(String rack,
428       Set<ServerName> skipServerSet) throws IOException {
429     if(rack == null) return null;
430     List<ServerName> serverList = this.rackToRegionServerMap.get(rack);
431     if (serverList == null) return null;
432 
433     // Get a random server except for any servers from the skip set
434     if (skipServerSet != null && serverList.size() <= skipServerSet.size()) {
435       throw new IOException("Cannot randomly pick another random server");
436     }
437 
438     ServerName randomServer;
439     do {
440       int randomIndex = random.nextInt(serverList.size());
441       randomServer = serverList.get(randomIndex);
442     } while (skipServerSet != null && skipServerSet.contains(randomServer));
443 
444     return randomServer;
445   }
446 
447   private ServerName getOneRandomServer(String rack) throws IOException {
448     return this.getOneRandomServer(rack, null);
449   }
450 
451   private String getOneRandomRack(Set<String> skipRackSet) throws IOException {
452     if (skipRackSet == null || uniqueRackList.size() <= skipRackSet.size()) {
453       throw new IOException("Cannot randomly pick another random server");
454     }
455 
456     String randomRack;
457     do {
458       int randomIndex = random.nextInt(this.uniqueRackList.size());
459       randomRack = this.uniqueRackList.get(randomIndex);
460     } while (skipRackSet.contains(randomRack));
461 
462     return randomRack;
463   }
464 }