View Javadoc

1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.RegionTransition;
39  import org.apache.hadoop.hbase.Server;
40  import org.apache.hadoop.hbase.ServerLoad;
41  import org.apache.hadoop.hbase.ServerName;
42  import org.apache.hadoop.hbase.TableName;
43  import org.apache.hadoop.hbase.TableStateManager;
44  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
45  import org.apache.hadoop.hbase.MetaTableAccessor;
46  import org.apache.hadoop.hbase.master.RegionState.State;
47  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
48  import org.apache.hadoop.hbase.util.Bytes;
49  import org.apache.hadoop.hbase.util.FSUtils;
50  import org.apache.hadoop.hbase.util.Pair;
51  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
52  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
53  import org.apache.zookeeper.KeeperException;
54  
55  import com.google.common.annotations.VisibleForTesting;
56  import com.google.common.base.Preconditions;
57  
58  /**
59   * Region state accountant. It holds the states of all regions in the memory.
60   * In normal scenario, it should match the meta table and the true region states.
61   *
62   * This map is used by AssignmentManager to track region states.
63   */
64  @InterfaceAudience.Private
65  public class RegionStates {
  private static final Log LOG = LogFactory.getLog(RegionStates.class);

  /**
   * Regions currently in transition, keyed by encoded region name.
   */
  final HashMap<String, RegionState> regionsInTransition =
    new HashMap<String, RegionState>();

  /**
   * Encoded region name to state map.
   * All the regions should be in this map.
   */
  private final Map<String, RegionState> regionStates =
    new HashMap<String, RegionState>();

  /**
   * Server to regions assignment map.
   * Contains the set of regions currently assigned to a given server.
   * An entry may still list a region that has gone offline until that
   * region comes online elsewhere (see oldAssignments).
   */
  private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
    new HashMap<ServerName, Set<HRegionInfo>>();

  /**
   * Maintains the mapping from the default replica of a region to the
   * replicas of that region that have been onlined. The region passed to
   * regionOnline is itself added to the set, so the default replica can be
   * a member of its own set.
   */
  private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
    new HashMap<HRegionInfo, Set<HRegionInfo>>();

  /**
   * Region to server assignment map.
   * Contains the server a given region is currently assigned to.
   */
  private final TreeMap<HRegionInfo, ServerName> regionAssignments =
    new TreeMap<HRegionInfo, ServerName>();

  /**
   * Encoded region name to server assignment map, kept for re-assignment
   * purposes. Contains the server a given region was last known to be
   * assigned to and whose log splitting has not completed, so the region
   * is not assignable yet.
   * If a region is currently assigned, the server in this map should be
   * the same as the one in regionAssignments.
   * However, the entry in regionAssignments is cleared when the region
   * goes offline, while the entry in lastAssignments is cleared when
   * the region is closed or the dead server has been processed.
   */
  private final HashMap<String, ServerName> lastAssignments =
    new HashMap<String, ServerName>();

  /**
   * Encoded region name to server assignment map, used to clean up
   * serverHoldings when a region comes online on a new server. When the
   * region went offline from the previous server, regionAssignments was
   * cleaned up so that it holds the latest assignments, but serverHoldings
   * was deliberately left alone so that it keeps matching meta. This map
   * remembers the old server whose serverHoldings entry needs cleanup
   * once the region has moved.
   */
  private final HashMap<String, ServerName> oldAssignments =
    new HashMap<String, ServerName>();

  /**
   * Maps a host-and-port string to the latest start code of a region
   * server which is known to be dead. It is dead to us, but the server
   * manager may not know it yet.
   */
  private final HashMap<String, Long> deadServers =
    new HashMap<String, Long>();

  /**
   * Maps a dead server to the time (milliseconds, as returned by
   * System.currentTimeMillis()) when its log split was done.
   * Since log splitting is not ordered, we have to remember
   * all processed instances. The map is cleaned up based
   * on a configured time. By default, we assume a dead
   * server should be done with log splitting in two hours.
   */
  private final HashMap<ServerName, Long> processedServers =
    new HashMap<ServerName, Long>();
  // Time (milliseconds) at which processedServers was last purged of stale entries.
  private long lastProcessedServerCleanTime;

  private final TableStateManager tableStateManager;
  private final RegionStateStore regionStateStore;
  private final ServerManager serverManager;
  // The hosting server (master); source of configuration, connection and stop/abort hooks.
  private final Server server;

  // Configuration key: the maximum time (milliseconds) to keep a server's
  // log split info in processedServers.
  static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
  static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
153 
154   RegionStates(final Server master, final TableStateManager tableStateManager,
155       final ServerManager serverManager, final RegionStateStore regionStateStore) {
156     this.tableStateManager = tableStateManager;
157     this.regionStateStore = regionStateStore;
158     this.serverManager = serverManager;
159     this.server = master;
160   }
161 
162   /**
163    * @return an unmodifiable the region assignment map
164    */
165   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
166     return Collections.unmodifiableMap(regionAssignments);
167   }
168 
169   /**
170    * Return the replicas (including default) for the regions grouped by ServerName
171    * @param regions
172    * @return a pair containing the groupings as a map
173    */
174   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
175     Collection<HRegionInfo> regions) {
176     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
177     for (HRegionInfo region : regions) {
178       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
179       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
180       if (allReplicas != null) {
181         for (HRegionInfo hri : allReplicas) {
182           ServerName server = regionAssignments.get(hri);
183           if (server != null) {
184             List<HRegionInfo> regionsOnServer = map.get(server);
185             if (regionsOnServer == null) {
186               regionsOnServer = new ArrayList<HRegionInfo>(1);
187               map.put(server, regionsOnServer);
188             }
189             regionsOnServer.add(hri);
190           }
191         }
192       }
193     }
194     return map;
195   }
196 
197   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
198     return regionAssignments.get(hri);
199   }
200 
201   /**
202    * Get regions in transition and their states
203    */
204   @SuppressWarnings("unchecked")
205   public synchronized Map<String, RegionState> getRegionsInTransition() {
206     return (Map<String, RegionState>)regionsInTransition.clone();
207   }
208 
209   /**
210    * @return True if specified region in transition.
211    */
212   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
213     return regionsInTransition.containsKey(hri.getEncodedName());
214   }
215 
216   /**
217    * @return True if specified region in transition.
218    */
219   public synchronized boolean isRegionInTransition(final String encodedName) {
220     return regionsInTransition.containsKey(encodedName);
221   }
222 
223   /**
224    * @return True if any region in transition.
225    */
226   public synchronized boolean isRegionsInTransition() {
227     return !regionsInTransition.isEmpty();
228   }
229 
230   /**
231    * @return True if specified region assigned, and not in transition.
232    */
233   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
234     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
235   }
236 
237   /**
238    * @return True if specified region offline/closed, but not in transition.
239    * If the region is not in the map, it is offline to us too.
240    */
241   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
242     return getRegionState(hri) == null || (!isRegionInTransition(hri)
243       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
244   }
245 
246   /**
247    * @return True if specified region is in one of the specified states.
248    */
249   public boolean isRegionInState(
250       final HRegionInfo hri, final State... states) {
251     return isRegionInState(hri.getEncodedName(), states);
252   }
253 
254   /**
255    * @return True if specified region is in one of the specified states.
256    */
257   public boolean isRegionInState(
258       final String encodedName, final State... states) {
259     RegionState regionState = getRegionState(encodedName);
260     return isOneOfStates(regionState, states);
261   }
262 
263   /**
264    * Wait for the state map to be updated by assignment manager.
265    */
266   public synchronized void waitForUpdate(
267       final long timeout) throws InterruptedException {
268     this.wait(timeout);
269   }
270 
271   /**
272    * Get region transition state
273    */
274   public RegionState getRegionTransitionState(final HRegionInfo hri) {
275     return getRegionTransitionState(hri.getEncodedName());
276   }
277 
278   /**
279    * Get region transition state
280    */
281   public synchronized RegionState
282       getRegionTransitionState(final String encodedName) {
283     return regionsInTransition.get(encodedName);
284   }
285 
286   /**
287    * Add a list of regions to RegionStates. If a region is split
288    * and offline, its state will be SPLIT. Otherwise, its state will
289    * be OFFLINE. Region already in RegionStates will be skipped.
290    */
291   public void createRegionStates(
292       final List<HRegionInfo> hris) {
293     for (HRegionInfo hri: hris) {
294       createRegionState(hri);
295     }
296   }
297 
298   /**
299    * Add a region to RegionStates. If the region is split
300    * and offline, its state will be SPLIT. Otherwise, its state will
301    * be OFFLINE. If it is already in RegionStates, this call has
302    * no effect, and the original state is returned.
303    */
304   public RegionState createRegionState(final HRegionInfo hri) {
305     return createRegionState(hri, null, null, null);
306   }
307 
308   /**
309    * Add a region to RegionStates with the specified state.
310    * If the region is already in RegionStates, this call has
311    * no effect, and the original state is returned.
312    *
313    * @param hri the region info to create a state for
314    * @param newState the state to the region in set to
315    * @param serverName the server the region is transitioning on
316    * @param lastHost the last server that hosts the region
317    * @return the current state
318    */
319   public synchronized RegionState createRegionState(final HRegionInfo hri,
320       State newState, ServerName serverName, ServerName lastHost) {
321     if (newState == null || (newState == State.OPEN && serverName == null)) {
322       newState =  State.OFFLINE;
323     }
324     if (hri.isOffline() && hri.isSplit()) {
325       newState = State.SPLIT;
326       serverName = null;
327     }
328     String encodedName = hri.getEncodedName();
329     RegionState regionState = regionStates.get(encodedName);
330     if (regionState != null) {
331       LOG.warn("Tried to create a state for a region already in RegionStates, "
332         + "used existing: " + regionState + ", ignored new: " + newState);
333     } else {
334       regionState = new RegionState(hri, newState, serverName);
335       regionStates.put(encodedName, regionState);
336       if (newState == State.OPEN) {
337         if (!serverName.equals(lastHost)) {
338           LOG.warn("Open region's last host " + lastHost
339             + " should be the same as the current one " + serverName
340             + ", ignored the last and used the current one");
341           lastHost = serverName;
342         }
343         lastAssignments.put(encodedName, lastHost);
344         regionAssignments.put(hri, lastHost);
345       } else if (!regionState.isUnassignable()) {
346         regionsInTransition.put(encodedName, regionState);
347       }
348       if (lastHost != null && newState != State.SPLIT) {
349         addToServerHoldings(lastHost, hri);
350         if (newState != State.OPEN) {
351           oldAssignments.put(encodedName, lastHost);
352         }
353       }
354     }
355     return regionState;
356   }
357 
358   /**
359    * Update a region state. It will be put in transition if not already there.
360    */
361   public RegionState updateRegionState(
362       final HRegionInfo hri, final State state) {
363     RegionState regionState = getRegionState(hri.getEncodedName());
364     return updateRegionState(hri, state,
365       regionState == null ? null : regionState.getServerName());
366   }
367 
368   /**
369    * Update a region state. It will be put in transition if not already there.
370    *
371    * If we can't find the region info based on the region name in
372    * the transition, log a warning and return null.
373    */
374   public RegionState updateRegionState(
375       final RegionTransition transition, final State state) {
376     byte [] regionName = transition.getRegionName();
377     HRegionInfo regionInfo = getRegionInfo(regionName);
378     if (regionInfo == null) {
379       String prettyRegionName = HRegionInfo.prettyPrint(
380         HRegionInfo.encodeRegionName(regionName));
381       LOG.warn("Failed to find region " + prettyRegionName
382         + " in updating its state to " + state
383         + " based on region transition " + transition);
384       return null;
385     }
386     return updateRegionState(regionInfo, state,
387       transition.getServerName());
388   }
389 
390   /**
391    * Transition a region state to OPEN from OPENING/PENDING_OPEN
392    */
393   public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
394       final RegionTransition transition, final RegionState fromState, final ServerName sn) {
395     if(fromState.isPendingOpenOrOpeningOnServer(sn)){
396       return updateRegionState(transition, State.OPEN);
397     }
398     return null;
399   }
400 
401   /**
402    * Update a region state. It will be put in transition if not already there.
403    */
404   public RegionState updateRegionState(
405       final HRegionInfo hri, final State state, final ServerName serverName) {
406     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
407   }
408 
409   public void regionOnline(
410       final HRegionInfo hri, final ServerName serverName) {
411     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
412   }
413 
414   /**
415    * A region is online, won't be in transition any more.
416    * We can't confirm it is really online on specified region server
417    * because it hasn't been put in region server's online region list yet.
418    */
419   public void regionOnline(final HRegionInfo hri,
420       final ServerName serverName, long openSeqNum) {
421     String encodedName = hri.getEncodedName();
422     if (!serverManager.isServerOnline(serverName)) {
423       // This is possible if the region server dies before master gets a
424       // chance to handle ZK event in time. At this time, if the dead server
425       // is already processed by SSH, we should ignore this event.
426       // If not processed yet, ignore and let SSH deal with it.
427       LOG.warn("Ignored, " + encodedName
428         + " was opened on a dead server: " + serverName);
429       return;
430     }
431     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
432 
433     synchronized (this) {
434       regionsInTransition.remove(encodedName);
435       ServerName oldServerName = regionAssignments.put(hri, serverName);
436       if (!serverName.equals(oldServerName)) {
437         LOG.info("Onlined " + hri.getShortNameToLog() + " on " + serverName);
438         addToServerHoldings(serverName, hri);
439         addToReplicaMapping(hri);
440         if (oldServerName == null) {
441           oldServerName = oldAssignments.remove(encodedName);
442         }
443         if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
444           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
445           removeFromServerHoldings(oldServerName, hri);
446         }
447       }
448     }
449   }
450 
451   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
452     Set<HRegionInfo> regions = serverHoldings.get(serverName);
453     if (regions == null) {
454       regions = new HashSet<HRegionInfo>();
455       serverHoldings.put(serverName, regions);
456     }
457     regions.add(hri);
458   }
459 
460   private void addToReplicaMapping(HRegionInfo hri) {
461     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
462     Set<HRegionInfo> replicas =
463         defaultReplicaToOtherReplicas.get(defaultReplica);
464     if (replicas == null) {
465       replicas = new HashSet<HRegionInfo>();
466       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
467     }
468     replicas.add(hri);
469   }
470 
471   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
472     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
473     oldRegions.remove(hri);
474     if (oldRegions.isEmpty()) {
475       serverHoldings.remove(serverName);
476     }
477   }
478 
479   private void removeFromReplicaMapping(HRegionInfo hri) {
480     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
481     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
482     if (replicas != null) {
483       replicas.remove(hri);
484       if (replicas.isEmpty()) {
485         defaultReplicaToOtherReplicas.remove(defaultReplica);
486       }
487     }
488   }
489 
490   /**
491    * A dead server's wals have been split so that all the regions
492    * used to be open on it can be safely assigned now. Mark them assignable.
493    */
494   public synchronized void logSplit(final ServerName serverName) {
495     for (Iterator<Map.Entry<String, ServerName>> it
496         = lastAssignments.entrySet().iterator(); it.hasNext();) {
497       Map.Entry<String, ServerName> e = it.next();
498       if (e.getValue().equals(serverName)) {
499         it.remove();
500       }
501     }
502     long now = System.currentTimeMillis();
503     if (LOG.isDebugEnabled()) {
504       LOG.debug("Adding to processed servers " + serverName);
505     }
506     processedServers.put(serverName, Long.valueOf(now));
507     Configuration conf = server.getConfiguration();
508     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
509     // Doesn't have to be very accurate about the clean up time
510     if (now > lastProcessedServerCleanTime + obsoleteTime) {
511       lastProcessedServerCleanTime = now;
512       long cutoff = now - obsoleteTime;
513       for (Iterator<Map.Entry<ServerName, Long>> it
514           = processedServers.entrySet().iterator(); it.hasNext();) {
515         Map.Entry<ServerName, Long> e = it.next();
516         if (e.getValue().longValue() < cutoff) {
517           if (LOG.isDebugEnabled()) {
518             LOG.debug("Removed from processed servers " + e.getKey());
519           }
520           it.remove();
521         }
522       }
523     }
524   }
525 
526   /**
527    * Log split is done for a given region, so it is assignable now.
528    */
529   public void logSplit(final HRegionInfo region) {
530     clearLastAssignment(region);
531   }
532 
533   public synchronized void clearLastAssignment(final HRegionInfo region) {
534     lastAssignments.remove(region.getEncodedName());
535   }
536 
537   /**
538    * A region is offline, won't be in transition any more.
539    */
540   public void regionOffline(final HRegionInfo hri) {
541     regionOffline(hri, null);
542   }
543 
544   /**
545    * A region is offline, won't be in transition any more. Its state
546    * should be the specified expected state, which can only be
547    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
548    */
549   public void regionOffline(
550       final HRegionInfo hri, final State expectedState) {
551     Preconditions.checkArgument(expectedState == null
552       || RegionState.isUnassignable(expectedState),
553         "Offlined region should not be " + expectedState);
554     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
555       // Remove it from all region maps
556       deleteRegion(hri);
557       return;
558     }
559     State newState =
560       expectedState == null ? State.OFFLINE : expectedState;
561     updateRegionState(hri, newState);
562     String encodedName = hri.getEncodedName();
563     synchronized (this) {
564       regionsInTransition.remove(encodedName);
565       ServerName oldServerName = regionAssignments.remove(hri);
566       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
567         if (newState == State.MERGED || newState == State.SPLIT
568             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
569               ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
570           // Offline the region only if it's merged/split, or the table is disabled/disabling.
571           // Otherwise, offline it from this server only when it is online on a different server.
572           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
573           removeFromServerHoldings(oldServerName, hri);
574           removeFromReplicaMapping(hri);
575         } else {
576           // Need to remember it so that we can offline it from this
577           // server when it is online on a different server.
578           oldAssignments.put(encodedName, oldServerName);
579         }
580       }
581     }
582   }
583 
584   /**
585    * A server is offline, all regions on it are dead.
586    */
587   public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
588     // Offline all regions on this server not already in transition.
589     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
590     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
591     synchronized (this) {
592       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
593       if (assignedRegions == null) {
594         assignedRegions = new HashSet<HRegionInfo>();
595       }
596 
597       // Offline regions outside the loop to avoid ConcurrentModificationException
598       Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
599       for (HRegionInfo region : assignedRegions) {
600         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
601         if (isRegionOnline(region)) {
602           regionsToOffline.add(region);
603         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
604           LOG.debug("Offline splitting/merging region " + getRegionState(region));
605           try {
606             // Delete the ZNode if exists
607             ZKAssign.deleteNodeFailSilent(watcher, region);
608             regionsToOffline.add(region);
609           } catch (KeeperException ke) {
610             server.abort("Unexpected ZK exception deleting node " + region, ke);
611           }
612         }
613       }
614 
615       for (RegionState state : regionsInTransition.values()) {
616         HRegionInfo hri = state.getRegion();
617         if (assignedRegions.contains(hri)) {
618           // Region is open on this region server, but in transition.
619           // This region must be moving away from this server, or splitting/merging.
620           // SSH will handle it, either skip assigning, or re-assign.
621           LOG.info("Transitioning " + state + " will be handled by SSH for " + sn);
622         } else if (sn.equals(state.getServerName())) {
623           // Region is in transition on this region server, and this
624           // region is not open on this server. So the region must be
625           // moving to this server from another one (i.e. opening or
626           // pending open on this server, was open on another one.
627           // Offline state is also kind of pending open if the region is in
628           // transition. The region could be in failed_close state too if we have
629           // tried several times to open it while this region server is not reachable)
630           if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
631             LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
632             rits.add(hri);
633           } else if(state.isSplittingNew()) {
634             regionsToCleanIfNoMetaEntry.add(state.getRegion());
635           } else {
636             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
637           }
638         }
639       }
640 
641       for (HRegionInfo hri : regionsToOffline) {
642         regionOffline(hri);
643       }
644 
645       this.notifyAll();
646     }
647     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
648     return rits;
649   }
650 
651   /**
652    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
653    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
654    */
655   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
656     if (hris.isEmpty()) return;
657     for (HRegionInfo hri: hris) {
658       try {
659         // This is RPC to meta table. It is done while we have a synchronize on
660         // regionstates. No progress will be made if meta is not available at this time.
661         // This is a cleanup task. Not critical.
662         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
663             null) {
664           regionOffline(hri);
665           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
666         }
667       } catch (IOException e) {
668         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
669       }
670     }
671   }
672 
673   /**
674    * Gets the online regions of the specified table.
675    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
676    * Only returns <em>online</em> regions.  If a region on this table has been
677    * closed during a disable, etc., it will be included in the returned list.
678    * So, the returned list may not necessarily be ALL regions in this table, its
679    * all the ONLINE regions in the table.
680    * @param tableName
681    * @return Online regions from <code>tableName</code>
682    */
683   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
684     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
685     // boundary needs to have table's name but regionID 0 so that it is sorted
686     // before all table's regions.
687     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
688     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
689       if(!hri.getTable().equals(tableName)) break;
690       tableRegions.add(hri);
691     }
692     return tableRegions;
693   }
694 
695 
696   /**
697    * Wait on region to clear regions-in-transition.
698    * <p>
699    * If the region isn't in transition, returns immediately.  Otherwise, method
700    * blocks until the region is out of transition.
701    */
702   public synchronized void waitOnRegionToClearRegionsInTransition(
703       final HRegionInfo hri) throws InterruptedException {
704     if (!isRegionInTransition(hri)) return;
705 
706     while(!server.isStopped() && isRegionInTransition(hri)) {
707       RegionState rs = getRegionState(hri);
708       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
709       waitForUpdate(100);
710     }
711 
712     if (server.isStopped()) {
713       LOG.info("Giving up wait on region in " +
714         "transition because stoppable.isStopped is set");
715     }
716   }
717 
718   /**
719    * A table is deleted. Remove its regions from all internal maps.
720    * We loop through all regions assuming we don't delete tables too much.
721    */
722   public void tableDeleted(final TableName tableName) {
723     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
724     synchronized (this) {
725       for (RegionState state: regionStates.values()) {
726         HRegionInfo region = state.getRegion();
727         if (region.getTable().equals(tableName)) {
728           regionsToDelete.add(region);
729         }
730       }
731     }
732     for (HRegionInfo region: regionsToDelete) {
733       deleteRegion(region);
734     }
735   }
736 
737   /**
738    * Get a copy of all regions assigned to a server
739    */
740   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
741     Set<HRegionInfo> regions = serverHoldings.get(serverName);
742     if (regions == null) return null;
743     return new HashSet<HRegionInfo>(regions);
744   }
745 
746   /**
747    * Remove a region from all state maps.
748    */
749   @VisibleForTesting
750   public synchronized void deleteRegion(final HRegionInfo hri) {
751     String encodedName = hri.getEncodedName();
752     regionsInTransition.remove(encodedName);
753     regionStates.remove(encodedName);
754     lastAssignments.remove(encodedName);
755     ServerName sn = regionAssignments.remove(hri);
756     if (sn != null) {
757       Set<HRegionInfo> regions = serverHoldings.get(sn);
758       regions.remove(hri);
759     }
760   }
761 
762   /**
763    * Checking if a region was assigned to a server which is not online now.
764    * If so, we should hold re-assign this region till SSH has split its wals.
765    * Once logs are split, the last assignment of this region will be reset,
766    * which means a null last assignment server is ok for re-assigning.
767    *
768    * A region server could be dead but we don't know it yet. We may
769    * think it's online falsely. Therefore if a server is online, we still
770    * need to confirm it reachable and having the expected start code.
771    */
772   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
773     ServerName server = lastAssignments.get(encodedName);
774     return isServerDeadAndNotProcessed(server);
775   }
776 
777   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
778     if (server == null) return false;
779     if (serverManager.isServerOnline(server)) {
780       String hostAndPort = server.getHostAndPort();
781       long startCode = server.getStartcode();
782       Long deadCode = deadServers.get(hostAndPort);
783       if (deadCode == null || startCode > deadCode.longValue()) {
784         if (serverManager.isServerReachable(server)) {
785           return false;
786         }
787         // The size of deadServers won't grow unbounded.
788         deadServers.put(hostAndPort, Long.valueOf(startCode));
789       }
790       // Watch out! If the server is not dead, the region could
791       // remain unassigned. That's why ServerManager#isServerReachable
792       // should use some retry.
793       //
794       // We cache this info since it is very unlikely for that
795       // instance to come back up later on. We don't want to expire
796       // the server since we prefer to let it die naturally.
797       LOG.warn("Couldn't reach online server " + server);
798     }
799     // Now, we know it's dead. Check if it's processed
800     return !processedServers.containsKey(server);
801   }
802 
803  /**
804    * Get the last region server a region was on for purpose of re-assignment,
805    * i.e. should the re-assignment be held back till log split is done?
806    */
807   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
808     return lastAssignments.get(encodedName);
809   }
810 
811   synchronized void setLastRegionServerOfRegions(
812       final ServerName serverName, final List<HRegionInfo> regionInfos) {
813     for (HRegionInfo hri: regionInfos) {
814       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
815     }
816   }
817 
818   synchronized void setLastRegionServerOfRegion(
819       final ServerName serverName, final String encodedName) {
820     lastAssignments.put(encodedName, serverName);
821   }
822 
823   void splitRegion(HRegionInfo p,
824       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
825     regionStateStore.splitRegion(p, a, b, sn);
826     synchronized (this) {
827       // After PONR, split is considered to be done.
828       // Update server holdings to be aligned with the meta.
829       Set<HRegionInfo> regions = serverHoldings.get(sn);
830       if (regions == null) {
831         throw new IllegalStateException(sn + " should host some regions");
832       }
833       regions.remove(p);
834       regions.add(a);
835       regions.add(b);
836     }
837   }
838 
839   void mergeRegions(HRegionInfo p,
840       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
841     regionStateStore.mergeRegions(p, a, b, sn);
842     synchronized (this) {
843       // After PONR, merge is considered to be done.
844       // Update server holdings to be aligned with the meta.
845       Set<HRegionInfo> regions = serverHoldings.get(sn);
846       if (regions == null) {
847         throw new IllegalStateException(sn + " should host some regions");
848       }
849       regions.remove(a);
850       regions.remove(b);
851       regions.add(p);
852     }
853   }
854 
855   /**
856    * At cluster clean re/start, mark all user regions closed except those of tables
857    * that are excluded, such as disabled/disabling/enabling tables. All user regions
858    * and their previous locations are returned.
859    */
860   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
861     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
862     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
863     for(RegionState state: regionStates.values()) {
864       HRegionInfo hri = state.getRegion();
865       if (state.isSplit() || hri.isSplit()) {
866         continue;
867       }
868       TableName tableName = hri.getTable();
869       if (!TableName.META_TABLE_NAME.equals(tableName)
870           && (noExcludeTables || !excludedTables.contains(tableName))) {
871         toBeClosed.add(hri);
872       }
873     }
874     Map<HRegionInfo, ServerName> allUserRegions =
875       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
876     for (HRegionInfo hri: toBeClosed) {
877       RegionState regionState = updateRegionState(hri, State.CLOSED);
878       allUserRegions.put(hri, regionState.getServerName());
879     }
880     return allUserRegions;
881   }
882 
883   /**
884    * Compute the average load across all region servers.
885    * Currently, this uses a very naive computation - just uses the number of
886    * regions being served, ignoring stats about number of requests.
887    * @return the average load
888    */
889   protected synchronized double getAverageLoad() {
890     int numServers = 0, totalLoad = 0;
891     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
892       Set<HRegionInfo> regions = e.getValue();
893       ServerName serverName = e.getKey();
894       int regionCount = regions.size();
895       if (serverManager.isServerOnline(serverName)) {
896         totalLoad += regionCount;
897         numServers++;
898       }
899     }
900     if (numServers > 1) {
901       // The master region server holds only a couple regions.
902       // Don't consider this server in calculating the average load
903       // if there are other region servers to avoid possible confusion.
904       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
905       if (hris != null) {
906         totalLoad -= hris.size();
907         numServers--;
908       }
909     }
910     return numServers == 0 ? 0.0 :
911       (double)totalLoad / (double)numServers;
912   }
913 
914   /**
915    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
916    * Can't let out original since it can change and at least the load balancer
917    * wants to iterate this exported list.  We need to synchronize on regions
918    * since all access to this.servers is under a lock on this.regions.
919    *
920    * @return A clone of current assignments by table.
921    */
922   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
923       getAssignmentsByTable() {
924     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
925       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
926     synchronized (this) {
927       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
928         Map<ServerName, List<HRegionInfo>> svrToRegions =
929           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
930         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
931           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
932         }
933         result.put(TableName.valueOf("ensemble"), svrToRegions);
934       } else {
935         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
936           for (HRegionInfo hri: e.getValue()) {
937             if (hri.isMetaRegion()) continue;
938             TableName tablename = hri.getTable();
939             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
940             if (svrToRegions == null) {
941               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
942               result.put(tablename, svrToRegions);
943             }
944             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
945             if (regions == null) {
946               regions = new ArrayList<HRegionInfo>();
947               svrToRegions.put(e.getKey(), regions);
948             }
949             regions.add(hri);
950           }
951         }
952       }
953     }
954 
955     Map<ServerName, ServerLoad>
956       onlineSvrs = serverManager.getOnlineServers();
957     // Take care of servers w/o assignments, and remove servers in draining mode
958     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
959     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
960       for (ServerName svr: onlineSvrs.keySet()) {
961         if (!map.containsKey(svr)) {
962           map.put(svr, new ArrayList<HRegionInfo>());
963         }
964       }
965       map.keySet().removeAll(drainingServers);
966     }
967     return result;
968   }
969 
970   protected RegionState getRegionState(final HRegionInfo hri) {
971     return getRegionState(hri.getEncodedName());
972   }
973 
974   /**
975    * Returns a clone of region assignments per server
976    * @return a Map of ServerName to a List of HRegionInfo's
977    */
978   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
979     Map<ServerName, List<HRegionInfo>> regionsByServer =
980         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
981     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
982       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
983     }
984     return regionsByServer;
985   }
986 
987   protected synchronized RegionState getRegionState(final String encodedName) {
988     return regionStates.get(encodedName);
989   }
990 
991   /**
992    * Get the HRegionInfo from cache, if not there, from the hbase:meta table
993    * @param  regionName
994    * @return HRegionInfo for the region
995    */
996   @SuppressWarnings("deprecation")
997   protected HRegionInfo getRegionInfo(final byte [] regionName) {
998     String encodedName = HRegionInfo.encodeRegionName(regionName);
999     RegionState regionState = getRegionState(encodedName);
1000     if (regionState != null) {
1001       return regionState.getRegion();
1002     }
1003 
1004     try {
1005       Pair<HRegionInfo, ServerName> p =
1006         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1007       HRegionInfo hri = p == null ? null : p.getFirst();
1008       if (hri != null) {
1009         createRegionState(hri);
1010       }
1011       return hri;
1012     } catch (IOException e) {
1013       server.abort("Aborting because error occoured while reading "
1014         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1015       return null;
1016     }
1017   }
1018 
1019   static boolean isOneOfStates(RegionState regionState, State... states) {
1020     State s = regionState != null ? regionState.getState() : null;
1021     for (State state: states) {
1022       if (s == state) return true;
1023     }
1024     return false;
1025   }
1026 
  /**
   * Update a region state. It will be put in transition if not already there.
   *
   * A new RegionState stamped with the current time is built. If it differs
   * from the currently recorded state it is handed to the region state store
   * first; afterwards the in-memory maps are updated under the lock on this
   * object and all threads waiting on this object are notified.
   *
   * @param hri the region whose state changes
   * @param state the new state
   * @param serverName the server associated with the new state, may be null
   * @param openSeqNum passed through to the region state store
   * @return the newly created region state
   */
  private RegionState updateRegionState(final HRegionInfo hri,
      final State state, final ServerName serverName, long openSeqNum) {
    if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
      LOG.warn("Failed to open/close " + hri.getShortNameToLog()
        + " on " + serverName + ", set to " + state);
    }

    String encodedName = hri.getEncodedName();
    RegionState regionState = new RegionState(
      hri, state, System.currentTimeMillis(), serverName);
    // getRegionState(String) is itself synchronized; the store update below
    // runs before, and outside of, the synchronized block that follows.
    RegionState oldState = getRegionState(encodedName);
    if (!regionState.equals(oldState)) {
      LOG.info("Transition " + oldState + " to " + regionState);
      // Persist region state before updating in-memory info, if needed
      regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
    }

    synchronized (this) {
      // Both maps are written unconditionally, even when the new state
      // equals the old one.
      regionsInTransition.put(encodedName, regionState);
      regionStates.put(encodedName, regionState);

      // For these states, region should be properly closed.
      // There should be no log splitting issue.
      if ((state == State.CLOSED || state == State.MERGED
          || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
        // NOTE(review): if lastAssignments can map this key to null,
        // last.equals(...) below would throw a NullPointerException - confirm.
        ServerName last = lastAssignments.get(encodedName);
        if (last.equals(serverName)) {
          // Closed on the server it was last assigned to: forget that
          // last assignment.
          lastAssignments.remove(encodedName);
        } else {
          LOG.warn(encodedName + " moved to " + state + " on "
            + serverName + ", expected " + last);
        }
      }

      // Once a region is opened, record its last assignment right away.
      if (serverName != null && state == State.OPEN) {
        ServerName last = lastAssignments.get(encodedName);
        if (!serverName.equals(last)) {
          lastAssignments.put(encodedName, serverName);
          // Only warn here; the new assignment has already been recorded.
          if (last != null && isServerDeadAndNotProcessed(last)) {
            LOG.warn(encodedName + " moved to " + serverName
              + ", while it's previous host " + last
              + " is dead but not processed yet");
          }
        }
      }

      // notify the change
      // (presumably wakes callers blocked in waitForUpdate - TODO confirm)
      this.notifyAll();
    }
    return regionState;
  }
1082 }