1   /**
2    * Licensed to the Apache Software Foundation (ASF) under one
3    * or more contributor license agreements.  See the NOTICE file
4    * distributed with this work for additional information
5    * regarding copyright ownership.  The ASF licenses this file
6    * to you under the Apache License, Version 2.0 (the
7    * "License"); you may not use this file except in compliance
8    * with the License.  You may obtain a copy of the License at
9    *
10   *     http://www.apache.org/licenses/LICENSE-2.0
11   *
12   * Unless required by applicable law or agreed to in writing, software
13   * distributed under the License is distributed on an "AS IS" BASIS,
14   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15   * See the License for the specific language governing permissions and
16   * limitations under the License.
17   */
18  package org.apache.hadoop.hbase.master;
19  
20  import java.io.IOException;
21  import java.util.ArrayList;
22  import java.util.Collection;
23  import java.util.Collections;
24  import java.util.HashMap;
25  import java.util.HashSet;
26  import java.util.Iterator;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.Set;
30  import java.util.TreeMap;
31  
32  import org.apache.commons.logging.Log;
33  import org.apache.commons.logging.LogFactory;
34  import org.apache.hadoop.hbase.classification.InterfaceAudience;
35  import org.apache.hadoop.conf.Configuration;
36  import org.apache.hadoop.hbase.HConstants;
37  import org.apache.hadoop.hbase.HRegionInfo;
38  import org.apache.hadoop.hbase.RegionTransition;
39  import org.apache.hadoop.hbase.Server;
40  import org.apache.hadoop.hbase.ServerLoad;
41  import org.apache.hadoop.hbase.ServerName;
42  import org.apache.hadoop.hbase.TableName;
43  import org.apache.hadoop.hbase.TableStateManager;
44  import org.apache.hadoop.hbase.client.RegionReplicaUtil;
45  import org.apache.hadoop.hbase.MetaTableAccessor;
46  import org.apache.hadoop.hbase.master.RegionState.State;
47  import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos;
48  import org.apache.hadoop.hbase.util.Bytes;
49  import org.apache.hadoop.hbase.util.FSUtils;
50  import org.apache.hadoop.hbase.util.Pair;
51  import org.apache.hadoop.hbase.zookeeper.ZKAssign;
52  import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
53  import org.apache.zookeeper.KeeperException;
54  
55  import com.google.common.annotations.VisibleForTesting;
56  import com.google.common.base.Preconditions;
57  
58  /**
59   * Region state accountant. It holds the states of all regions in memory.
60   * In a normal scenario, it should match the meta table and the true region states.
61   *
62   * This class is used by the AssignmentManager to track region states.
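 *
 * <p>A minimal, illustrative call sequence (the caller-side wiring here is
 * hypothetical; the methods named are the ones defined in this class):
 * <pre>
 *   regionStates.createRegionState(hri);                   // registered as OFFLINE
 *   regionStates.updateRegionState(hri, State.PENDING_OPEN, serverName);
 *   regionStates.regionOnline(hri, serverName);            // no longer in transition
 *   boolean open = regionStates.isRegionOnline(hri);       // true once assigned
 * </pre>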
63   */
64  @InterfaceAudience.Private
65  public class RegionStates {
66    private static final Log LOG = LogFactory.getLog(RegionStates.class);
67  
68    /**
69     * Regions currently in transition.
70     */
71    final HashMap<String, RegionState> regionsInTransition =
72      new HashMap<String, RegionState>();
73  
74    /**
75     * Region encoded name to state map.
76     * All the regions should be in this map.
77     */
78    private final Map<String, RegionState> regionStates =
79      new HashMap<String, RegionState>();
80  
81    /**
82     * Server to regions assignment map.
83     * Contains the set of regions currently assigned to a given server.
84     */
85    private final Map<ServerName, Set<HRegionInfo>> serverHoldings =
86      new HashMap<ServerName, Set<HRegionInfo>>();
87  
88    /**
89     * Maintains the mapping from the default region to the replica regions.
90     */
91    private final Map<HRegionInfo, Set<HRegionInfo>> defaultReplicaToOtherReplicas =
92      new HashMap<HRegionInfo, Set<HRegionInfo>>();
93  
94    /**
95     * Region to server assignment map.
96     * Contains the server a given region is currently assigned to.
97     */
98    private final TreeMap<HRegionInfo, ServerName> regionAssignments =
99      new TreeMap<HRegionInfo, ServerName>();
100 
101   /**
102    * Encoded region name to server assignment map, used for re-assignment
103    * purposes. Contains the server a given region was last known to be
104    * assigned to; if that server has not completed log splitting, the
105    * region is not assignable. If a region is currently assigned, the
106    * server info in this map should be the same as that in regionAssignments.
107    * However, the info in regionAssignments is cleared when the region
108    * is offlined, while the info in lastAssignments is cleared when
109    * the region is closed or the server is dead and processed.
110    */
111   private final HashMap<String, ServerName> lastAssignments =
112     new HashMap<String, ServerName>();
113 
114   /**
115    * Encoded region name to server assignment map, used to clean up
116    * serverHoldings when a region comes online on a new server. When
117    * the region is offlined from the previous server, we clean up
118    * regionAssignments so that it has the latest assignment map, but
119    * we do not clean up serverHoldings to match the meta. We need this
120    * map to find the old server whose serverHoldings needs cleanup,
121    * given a moved region.
122    */
123   private final HashMap<String, ServerName> oldAssignments =
124     new HashMap<String, ServerName>();
125 
126   /**
127    * Maps a host/port pair string to the latest start code
128    * of a region server which is known to be dead. It is dead
129    * to us, but the server manager may not know it yet.
130    */
131   private final HashMap<String, Long> deadServers =
132     new HashMap<String, Long>();
133 
134   /**
135    * Maps a dead server to the time when its log splitting is done.
136    * Since log splitting is not ordered, we have to remember
137    * all processed instances. The map is cleaned up based
138    * on a configured time. By default, we assume a dead
139    * server should be done with log splitting in two hours.
140    */
141   private final HashMap<ServerName, Long> processedServers =
142     new HashMap<ServerName, Long>();
143   private long lastProcessedServerCleanTime;
144 
145   private final TableStateManager tableStateManager;
146   private final RegionStateStore regionStateStore;
147   private final ServerManager serverManager;
148   private final Server server;
149 
150   // The maximum time to keep log split info in the region states map
151   static final String LOG_SPLIT_TIME = "hbase.master.maximum.logsplit.keeptime";
152   static final long DEFAULT_LOG_SPLIT_TIME = 7200000L; // 2 hours
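  // Illustrative only: the keep time is read from the master's configuration in
  // logSplit(ServerName) below, so a deployment (or a test) could shorten it, e.g.
  //   conf.setLong("hbase.master.maximum.logsplit.keeptime", 3600000L); // 1 hour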
153 
154   RegionStates(final Server master, final TableStateManager tableStateManager,
155       final ServerManager serverManager, final RegionStateStore regionStateStore) {
156     this.tableStateManager = tableStateManager;
157     this.regionStateStore = regionStateStore;
158     this.serverManager = serverManager;
159     this.server = master;
160   }
161 
162   /**
163    * @return an unmodifiable view of the region assignment map
164    */
165   public synchronized Map<HRegionInfo, ServerName> getRegionAssignments() {
166     return Collections.unmodifiableMap(regionAssignments);
167   }
168 
169   /**
170    * Returns the replicas (including the default replica) for the given regions, grouped by ServerName.
171    * @param regions the regions to look up
172    * @return a map from each hosting ServerName to the replica regions it holds
173    */
174   synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignments(
175     Collection<HRegionInfo> regions) {
176     Map<ServerName, List<HRegionInfo>> map = new HashMap<ServerName, List<HRegionInfo>>();
177     for (HRegionInfo region : regions) {
178       HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(region);
179       Set<HRegionInfo> allReplicas = defaultReplicaToOtherReplicas.get(defaultReplica);
180       if (allReplicas != null) {
181         for (HRegionInfo hri : allReplicas) {
182           ServerName server = regionAssignments.get(hri);
183           if (server != null) {
184             List<HRegionInfo> regionsOnServer = map.get(server);
185             if (regionsOnServer == null) {
186               regionsOnServer = new ArrayList<HRegionInfo>(1);
187               map.put(server, regionsOnServer);
188             }
189             regionsOnServer.add(hri);
190           }
191         }
192       }
193     }
194     return map;
195   }
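  // Example (hypothetical caller in this package): group one table's online regions,
  // together with their replicas, by the server currently hosting them:
  //   Map<ServerName, List<HRegionInfo>> byServer =
  //       regionStates.getRegionAssignments(regionStates.getRegionsOfTable(tableName));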
196 
197   public synchronized ServerName getRegionServerOfRegion(HRegionInfo hri) {
198     return regionAssignments.get(hri);
199   }
200 
201   /**
202    * Get regions in transition and their states
203    */
204   @SuppressWarnings("unchecked")
205   public synchronized Map<String, RegionState> getRegionsInTransition() {
206     return (Map<String, RegionState>)regionsInTransition.clone();
207   }
208 
209   /**
210    * @return True if the specified region is in transition.
211    */
212   public synchronized boolean isRegionInTransition(final HRegionInfo hri) {
213     return regionsInTransition.containsKey(hri.getEncodedName());
214   }
215 
216   /**
217    * @return True if the specified region is in transition.
218    */
219   public synchronized boolean isRegionInTransition(final String encodedName) {
220     return regionsInTransition.containsKey(encodedName);
221   }
222 
223   /**
224    * @return True if any region is in transition.
225    */
226   public synchronized boolean isRegionsInTransition() {
227     return !regionsInTransition.isEmpty();
228   }
229 
230   /**
231    * @return True if the specified region is assigned, and not in transition.
232    */
233   public synchronized boolean isRegionOnline(final HRegionInfo hri) {
234     return !isRegionInTransition(hri) && regionAssignments.containsKey(hri);
235   }
236 
237   /**
238    * @return True if the specified region is offline/closed, but not in transition.
239    * If the region is not in the map, it is offline to us too.
240    */
241   public synchronized boolean isRegionOffline(final HRegionInfo hri) {
242     return getRegionState(hri) == null || (!isRegionInTransition(hri)
243       && isRegionInState(hri, State.OFFLINE, State.CLOSED));
244   }
245 
246   /**
247    * @return True if specified region is in one of the specified states.
248    */
249   public boolean isRegionInState(
250       final HRegionInfo hri, final State... states) {
251     return isRegionInState(hri.getEncodedName(), states);
252   }
253 
254   /**
255    * @return True if specified region is in one of the specified states.
256    */
257   public boolean isRegionInState(
258       final String encodedName, final State... states) {
259     RegionState regionState = getRegionState(encodedName);
260     return isOneOfStates(regionState, states);
261   }
262 
263   /**
264    * Wait for the state map to be updated by assignment manager.
265    */
266   public synchronized void waitForUpdate(
267       final long timeout) throws InterruptedException {
268     this.wait(timeout);
269   }
270 
271   /**
272    * Get region transition state
273    */
274   public RegionState getRegionTransitionState(final HRegionInfo hri) {
275     return getRegionTransitionState(hri.getEncodedName());
276   }
277 
278   /**
279    * Get region transition state
280    */
281   public synchronized RegionState
282       getRegionTransitionState(final String encodedName) {
283     return regionsInTransition.get(encodedName);
284   }
285 
286   /**
287    * Add a list of regions to RegionStates. If a region is split
288    * and offline, its state will be SPLIT. Otherwise, its state will
289    * be OFFLINE. Regions already in RegionStates will be skipped.
290    */
291   public void createRegionStates(
292       final List<HRegionInfo> hris) {
293     for (HRegionInfo hri: hris) {
294       createRegionState(hri);
295     }
296   }
297 
298   /**
299    * Add a region to RegionStates. If the region is split
300    * and offline, its state will be SPLIT. Otherwise, its state will
301    * be OFFLINE. If it is already in RegionStates, this call has
302    * no effect, and the original state is returned.
303    */
304   public RegionState createRegionState(final HRegionInfo hri) {
305     return createRegionState(hri, null, null, null);
306   }
307 
308   /**
309    * Add a region to RegionStates with the specified state.
310    * If the region is already in RegionStates, this call has
311    * no effect, and the original state is returned.
312    *
313    * @param hri the region info to create a state for
314    * @param newState the state to set the region to
315    * @param serverName the server the region is transitioning on
316    * @param lastHost the last server that hosts the region
317    * @return the current state
318    */
319   public synchronized RegionState createRegionState(final HRegionInfo hri,
320       State newState, ServerName serverName, ServerName lastHost) {
321     if (newState == null || (newState == State.OPEN && serverName == null)) {
322       newState =  State.OFFLINE;
323     }
324     if (hri.isOffline() && hri.isSplit()) {
325       newState = State.SPLIT;
326       serverName = null;
327     }
328     String encodedName = hri.getEncodedName();
329     RegionState regionState = regionStates.get(encodedName);
330     if (regionState != null) {
331       LOG.warn("Tried to create a state for a region already in RegionStates, "
332         + "used existing: " + regionState + ", ignored new: " + newState);
333     } else {
334       regionState = new RegionState(hri, newState, serverName);
335       regionStates.put(encodedName, regionState);
336       if (newState == State.OPEN) {
337         if (!serverName.equals(lastHost)) {
338           LOG.warn("Open region's last host " + lastHost
339             + " should be the same as the current one " + serverName
340             + ", ignored the last and used the current one");
341           lastHost = serverName;
342         }
343         lastAssignments.put(encodedName, lastHost);
344         regionAssignments.put(hri, lastHost);
345       } else if (!regionState.isUnassignable()) {
346         regionsInTransition.put(encodedName, regionState);
347       }
348       if (lastHost != null && newState != State.SPLIT) {
349         addToServerHoldings(lastHost, hri);
350         if (newState != State.OPEN) {
351           oldAssignments.put(encodedName, lastHost);
352         }
353       }
354     }
355     return regionState;
356   }
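  // Illustrative outcomes of createRegionState (the inputs named here are hypothetical):
  // an HRegionInfo marked both offline and split ends up SPLIT with no server, while a
  // region with no prior state and no explicit new state ends up OFFLINE.
  //   RegionState s = regionStates.createRegionState(splitParentHri); // -> SPLIT, null server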
357 
358   /**
359    * Update a region state. It will be put in transition if not already there.
360    */
361   public RegionState updateRegionState(
362       final HRegionInfo hri, final State state) {
363     RegionState regionState = getRegionState(hri.getEncodedName());
364     return updateRegionState(hri, state,
365       regionState == null ? null : regionState.getServerName());
366   }
367 
368   /**
369    * Update a region state. It will be put in transition if not already there.
370    *
371    * If we can't find the region info based on the region name in
372    * the transition, log a warning and return null.
373    */
374   public RegionState updateRegionState(
375       final RegionTransition transition, final State state) {
376     byte [] regionName = transition.getRegionName();
377     HRegionInfo regionInfo = getRegionInfo(regionName);
378     if (regionInfo == null) {
379       String prettyRegionName = HRegionInfo.prettyPrint(
380         HRegionInfo.encodeRegionName(regionName));
381       LOG.warn("Failed to find region " + prettyRegionName
382         + " in updating its state to " + state
383         + " based on region transition " + transition);
384       return null;
385     }
386     return updateRegionState(regionInfo, state,
387       transition.getServerName());
388   }
389 
390   /**
391    * Transition a region state to OPEN from OPENING/PENDING_OPEN
392    */
393   public synchronized RegionState transitionOpenFromPendingOpenOrOpeningOnServer(
394       final RegionTransition transition, final RegionState fromState, final ServerName sn) {
395     if(fromState.isPendingOpenOrOpeningOnServer(sn)){
396       return updateRegionState(transition, State.OPEN);
397     }
398     return null;
399   }
400 
401   /**
402    * Update a region state. It will be put in transition if not already there.
403    */
404   public RegionState updateRegionState(
405       final HRegionInfo hri, final State state, final ServerName serverName) {
406     return updateRegionState(hri, state, serverName, HConstants.NO_SEQNUM);
407   }
408 
409   public void regionOnline(
410       final HRegionInfo hri, final ServerName serverName) {
411     regionOnline(hri, serverName, HConstants.NO_SEQNUM);
412   }
413 
414   /**
415    * A region is online; it won't be in transition any more.
416    * We can't confirm it is really online on the specified region server
417    * because it hasn't been put in the region server's online region list yet.
418    */
419   public void regionOnline(final HRegionInfo hri,
420       final ServerName serverName, long openSeqNum) {
421     String encodedName = hri.getEncodedName();
422     if (!serverManager.isServerOnline(serverName)) {
423       // This is possible if the region server dies before master gets a
424       // chance to handle ZK event in time. At this time, if the dead server
425       // is already processed by SSH, we should ignore this event.
426       // If not processed yet, ignore and let SSH deal with it.
427       LOG.warn("Ignored, " + encodedName
428         + " was opened on a dead server: " + serverName);
429       return;
430     }
431     updateRegionState(hri, State.OPEN, serverName, openSeqNum);
432 
433     synchronized (this) {
434       regionsInTransition.remove(encodedName);
435       ServerName oldServerName = regionAssignments.put(hri, serverName);
436       if (!serverName.equals(oldServerName)) {
437         LOG.info("Onlined " + hri.getShortNameToLog() + " on " + serverName);
438         addToServerHoldings(serverName, hri);
439         addToReplicaMapping(hri);
440         if (oldServerName == null) {
441           oldServerName = oldAssignments.remove(encodedName);
442         }
443         if (oldServerName != null
444             && !oldServerName.equals(serverName)
445             && serverHoldings.containsKey(oldServerName)) {
446           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
447           removeFromServerHoldings(oldServerName, hri);
448         }
449       }
450     }
451   }
452 
453   private void addToServerHoldings(ServerName serverName, HRegionInfo hri) {
454     Set<HRegionInfo> regions = serverHoldings.get(serverName);
455     if (regions == null) {
456       regions = new HashSet<HRegionInfo>();
457       serverHoldings.put(serverName, regions);
458     }
459     regions.add(hri);
460   }
461 
462   private void addToReplicaMapping(HRegionInfo hri) {
463     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
464     Set<HRegionInfo> replicas =
465         defaultReplicaToOtherReplicas.get(defaultReplica);
466     if (replicas == null) {
467       replicas = new HashSet<HRegionInfo>();
468       defaultReplicaToOtherReplicas.put(defaultReplica, replicas);
469     }
470     replicas.add(hri);
471   }
472 
473   private void removeFromServerHoldings(ServerName serverName, HRegionInfo hri) {
474     Set<HRegionInfo> oldRegions = serverHoldings.get(serverName);
475     oldRegions.remove(hri);
476     if (oldRegions.isEmpty()) {
477       serverHoldings.remove(serverName);
478     }
479   }
480 
481   private void removeFromReplicaMapping(HRegionInfo hri) {
482     HRegionInfo defaultReplica = RegionReplicaUtil.getRegionInfoForDefaultReplica(hri);
483     Set<HRegionInfo> replicas = defaultReplicaToOtherReplicas.get(defaultReplica);
484     if (replicas != null) {
485       replicas.remove(hri);
486       if (replicas.isEmpty()) {
487         defaultReplicaToOtherReplicas.remove(defaultReplica);
488       }
489     }
490   }
491 
492   /**
493    * A dead server's WALs have been split, so all the regions that
494    * used to be open on it can now be safely assigned. Mark them assignable.
495    */
496   public synchronized void logSplit(final ServerName serverName) {
497     for (Iterator<Map.Entry<String, ServerName>> it
498         = lastAssignments.entrySet().iterator(); it.hasNext();) {
499       Map.Entry<String, ServerName> e = it.next();
500       if (e.getValue().equals(serverName)) {
501         it.remove();
502       }
503     }
504     long now = System.currentTimeMillis();
505     if (LOG.isDebugEnabled()) {
506       LOG.debug("Adding to processed servers " + serverName);
507     }
508     processedServers.put(serverName, Long.valueOf(now));
509     Configuration conf = server.getConfiguration();
510     long obsoleteTime = conf.getLong(LOG_SPLIT_TIME, DEFAULT_LOG_SPLIT_TIME);
511     // Doesn't have to be very accurate about the clean up time
512     if (now > lastProcessedServerCleanTime + obsoleteTime) {
513       lastProcessedServerCleanTime = now;
514       long cutoff = now - obsoleteTime;
515       for (Iterator<Map.Entry<ServerName, Long>> it
516           = processedServers.entrySet().iterator(); it.hasNext();) {
517         Map.Entry<ServerName, Long> e = it.next();
518         if (e.getValue().longValue() < cutoff) {
519           if (LOG.isDebugEnabled()) {
520             LOG.debug("Removed from processed servers " + e.getKey());
521           }
522           it.remove();
523         }
524       }
525     }
526   }
527 
528   /**
529    * Log split is done for a given region, so it is assignable now.
530    */
531   public void logSplit(final HRegionInfo region) {
532     clearLastAssignment(region);
533   }
534 
535   public synchronized void clearLastAssignment(final HRegionInfo region) {
536     lastAssignments.remove(region.getEncodedName());
537   }
538 
539   /**
540    * A region is offline; it won't be in transition any more.
541    */
542   public void regionOffline(final HRegionInfo hri) {
543     regionOffline(hri, null);
544   }
545 
546   /**
547    * A region is offline; it won't be in transition any more. Its state
548    * should be the specified expected state, which can only be
549    * Split/Merged/Offline/null(=Offline)/SplittingNew/MergingNew.
550    */
551   public void regionOffline(
552       final HRegionInfo hri, final State expectedState) {
553     Preconditions.checkArgument(expectedState == null
554       || RegionState.isUnassignable(expectedState),
555         "Offlined region should not be " + expectedState);
556     if (isRegionInState(hri, State.SPLITTING_NEW, State.MERGING_NEW)) {
557       // Remove it from all region maps
558       deleteRegion(hri);
559       return;
560     }
561     State newState =
562       expectedState == null ? State.OFFLINE : expectedState;
563     updateRegionState(hri, newState);
564     String encodedName = hri.getEncodedName();
565     synchronized (this) {
566       regionsInTransition.remove(encodedName);
567       ServerName oldServerName = regionAssignments.remove(hri);
568       if (oldServerName != null && serverHoldings.containsKey(oldServerName)) {
569         if (newState == State.MERGED || newState == State.SPLIT
570             || hri.isMetaRegion() || tableStateManager.isTableState(hri.getTable(),
571               ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING)) {
572           // Offline the region only if it's merged/split, or the table is disabled/disabling.
573           // Otherwise, offline it from this server only when it is online on a different server.
574           LOG.info("Offlined " + hri.getShortNameToLog() + " from " + oldServerName);
575           removeFromServerHoldings(oldServerName, hri);
576           removeFromReplicaMapping(hri);
577         } else {
578           // Need to remember it so that we can offline it from this
579           // server when it is online on a different server.
580           oldAssignments.put(encodedName, oldServerName);
581         }
582       }
583     }
584   }
585 
586   /**
587    * A server is offline, all regions on it are dead.
588    */
589   public List<HRegionInfo> serverOffline(final ZooKeeperWatcher watcher, final ServerName sn) {
590     // Offline all regions on this server not already in transition.
591     List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
592     Set<HRegionInfo> regionsToCleanIfNoMetaEntry = new HashSet<HRegionInfo>();
593     // Offline regions outside the loop and synchronized block to avoid
594     // ConcurrentModificationException and deadlock in case meta is unassigned
595     // but RegionStates is blocked.
596     Set<HRegionInfo> regionsToOffline = new HashSet<HRegionInfo>();
597     synchronized (this) {
598       Set<HRegionInfo> assignedRegions = serverHoldings.get(sn);
599       if (assignedRegions == null) {
600         assignedRegions = new HashSet<HRegionInfo>();
601       }
602 
603       for (HRegionInfo region : assignedRegions) {
604         // Offline open regions, no need to offline if SPLIT/MERGED/OFFLINE
605         if (isRegionOnline(region)) {
606           regionsToOffline.add(region);
607         } else if (isRegionInState(region, State.SPLITTING, State.MERGING)) {
608           LOG.debug("Offline splitting/merging region " + getRegionState(region));
609           try {
610             // Delete the ZNode if exists
611             ZKAssign.deleteNodeFailSilent(watcher, region);
612             regionsToOffline.add(region);
613           } catch (KeeperException ke) {
614             server.abort("Unexpected ZK exception deleting node " + region, ke);
615           }
616         }
617       }
618 
619       for (RegionState state : regionsInTransition.values()) {
620         HRegionInfo hri = state.getRegion();
621         if (assignedRegions.contains(hri)) {
622           // Region is open on this region server, but in transition.
623           // This region must be moving away from this server, or splitting/merging.
624           // SSH will handle it, either skip assigning, or re-assign.
625           LOG.info("Transitioning " + state + " will be handled by SSH for " + sn);
626         } else if (sn.equals(state.getServerName())) {
627           // Region is in transition on this region server, and this
628           // region is not open on this server. So the region must be
629           // moving to this server from another one (i.e. opening or
630           // pending open on this server, having been open on another one).
631           // Offline state is also kind of pending open if the region is in
632           // transition. The region could be in failed_close state too if we have
633           // tried several times to open it while this region server is not reachable.
634           if (state.isPendingOpenOrOpening() || state.isFailedClose() || state.isOffline()) {
635             LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
636             rits.add(hri);
637           } else if(state.isSplittingNew()) {
638             regionsToCleanIfNoMetaEntry.add(state.getRegion());
639           } else {
640             LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
641           }
642         }
643       }
644       this.notifyAll();
645     }
646 
647     for (HRegionInfo hri : regionsToOffline) {
648       regionOffline(hri);
649     }
650 
651     cleanIfNoMetaEntry(regionsToCleanIfNoMetaEntry);
652     return rits;
653   }
654 
655   /**
656    * This method does an RPC to hbase:meta. Do not call this method with a lock/synchronize held.
657    * @param hris The hris to check if empty in hbase:meta and if so, clean them up.
658    */
659   private void cleanIfNoMetaEntry(Set<HRegionInfo> hris) {
660     if (hris.isEmpty()) return;
661     for (HRegionInfo hri: hris) {
662       try {
663         // This is an RPC to the meta table. It should not be done while holding a lock
664         // on RegionStates; no progress would be made if meta is not available at this time.
665         // This is a cleanup task. Not critical.
666         if (MetaTableAccessor.getRegion(server.getConnection(), hri.getEncodedNameAsBytes()) ==
667             null) {
668           regionOffline(hri);
669           FSUtils.deleteRegionDir(server.getConfiguration(), hri);
670         }
671       } catch (IOException e) {
672         LOG.warn("Got exception while deleting " + hri + " directories from file system.", e);
673       }
674     }
675   }
676 
677   /**
678    * Gets the online regions of the specified table.
679    * This method looks at the in-memory state.  It does not go to <code>hbase:meta</code>.
680    * Only returns <em>online</em> regions.  If a region on this table has been
681    * closed during a disable, etc., it will not be included in the returned list.
682    * So, the returned list may not necessarily be ALL regions in this table, it's
683    * all the ONLINE regions in the table.
684    * @param tableName the table to get online regions for
685    * @return Online regions from <code>tableName</code>
686    */
687   public synchronized List<HRegionInfo> getRegionsOfTable(TableName tableName) {
688     List<HRegionInfo> tableRegions = new ArrayList<HRegionInfo>();
689     // boundary needs to have table's name but regionID 0 so that it is sorted
690     // before all table's regions.
691     HRegionInfo boundary = new HRegionInfo(tableName, null, null, false, 0L);
692     for (HRegionInfo hri: regionAssignments.tailMap(boundary).keySet()) {
693       if(!hri.getTable().equals(tableName)) break;
694       tableRegions.add(hri);
695     }
696     return tableRegions;
697   }
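  // Worked example (illustrative): if regionAssignments holds regions of tables t1 and
  // t2, the boundary HRegionInfo(t1, null, null, false, 0L) sorts before every t1 region
  // (same table name, smallest regionId), so the tailMap walk above returns exactly the
  // online t1 regions and stops at the first t2 region.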
698 
699 
700   /**
701    * Wait on region to clear regions-in-transition.
702    * <p>
703    * If the region isn't in transition, returns immediately.  Otherwise, method
704    * blocks until the region is out of transition.
705    */
706   public synchronized void waitOnRegionToClearRegionsInTransition(
707       final HRegionInfo hri) throws InterruptedException {
708     if (!isRegionInTransition(hri)) return;
709 
710     while(!server.isStopped() && isRegionInTransition(hri)) {
711       RegionState rs = getRegionState(hri);
712       LOG.info("Waiting on " + rs + " to clear regions-in-transition");
713       waitForUpdate(100);
714     }
715 
716     if (server.isStopped()) {
717       LOG.info("Giving up wait on region in " +
718         "transition because stoppable.isStopped is set");
719     }
720   }
721 
722   /**
723    * A table is deleted. Remove its regions from all internal maps.
724    * We loop through all regions assuming we don't delete tables too often.
725    */
726   public void tableDeleted(final TableName tableName) {
727     Set<HRegionInfo> regionsToDelete = new HashSet<HRegionInfo>();
728     synchronized (this) {
729       for (RegionState state: regionStates.values()) {
730         HRegionInfo region = state.getRegion();
731         if (region.getTable().equals(tableName)) {
732           regionsToDelete.add(region);
733         }
734       }
735     }
736     for (HRegionInfo region: regionsToDelete) {
737       deleteRegion(region);
738     }
739   }
740 
741   /**
742    * Get a copy of all regions assigned to a server
743    */
744   public synchronized Set<HRegionInfo> getServerRegions(ServerName serverName) {
745     Set<HRegionInfo> regions = serverHoldings.get(serverName);
746     if (regions == null) return null;
747     return new HashSet<HRegionInfo>(regions);
748   }
749 
750   /**
751    * Remove a region from all state maps.
752    */
753   @VisibleForTesting
754   public synchronized void deleteRegion(final HRegionInfo hri) {
755     String encodedName = hri.getEncodedName();
756     regionsInTransition.remove(encodedName);
757     regionStates.remove(encodedName);
758     lastAssignments.remove(encodedName);
759     ServerName sn = regionAssignments.remove(hri);
760     if (sn != null) {
761       Set<HRegionInfo> regions = serverHoldings.get(sn);
762       regions.remove(hri);
763     }
764   }
765 
766   /**
767    * Checks if a region was assigned to a server which is not online now.
768    * If so, we should hold off re-assigning this region till SSH has split its WALs.
769    * Once the logs are split, the last assignment of this region will be reset,
770    * which means a null last assignment server is ok for re-assigning.
771    *
772    * A region server could be dead but we don't know it yet. We may
773    * falsely think it's online. Therefore, if a server is online, we still
774    * need to confirm it is reachable and has the expected start code.
775    */
776   synchronized boolean wasRegionOnDeadServer(final String encodedName) {
777     ServerName server = lastAssignments.get(encodedName);
778     return isServerDeadAndNotProcessed(server);
779   }
780 
781   synchronized boolean isServerDeadAndNotProcessed(ServerName server) {
782     if (server == null) return false;
783     if (serverManager.isServerOnline(server)) {
784       String hostAndPort = server.getHostAndPort();
785       long startCode = server.getStartcode();
786       Long deadCode = deadServers.get(hostAndPort);
787       if (deadCode == null || startCode > deadCode.longValue()) {
788         if (serverManager.isServerReachable(server)) {
789           return false;
790         }
791         // The size of deadServers won't grow unbounded.
792         deadServers.put(hostAndPort, Long.valueOf(startCode));
793       }
794       // Watch out! If the server is not dead, the region could
795       // remain unassigned. That's why ServerManager#isServerReachable
796       // should use some retry.
797       //
798       // We cache this info since it is very unlikely for that
799       // instance to come back up later on. We don't want to expire
800       // the server since we prefer to let it die naturally.
801       LOG.warn("Couldn't reach online server " + server);
802     }
803     // Now, we know it's dead. Check if it's processed
804     return !processedServers.containsKey(server);
805   }
806 
807   /**
808    * Get the last region server a region was on, for the purpose of re-assignment,
809    * i.e. to decide whether the re-assignment should be held back till log split is done.
810    */
811   synchronized ServerName getLastRegionServerOfRegion(final String encodedName) {
812     return lastAssignments.get(encodedName);
813   }
814 
815   synchronized void setLastRegionServerOfRegions(
816       final ServerName serverName, final List<HRegionInfo> regionInfos) {
817     for (HRegionInfo hri: regionInfos) {
818       setLastRegionServerOfRegion(serverName, hri.getEncodedName());
819     }
820   }
821 
822   synchronized void setLastRegionServerOfRegion(
823       final ServerName serverName, final String encodedName) {
824     lastAssignments.put(encodedName, serverName);
825   }
826 
827   void splitRegion(HRegionInfo p,
828       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
829     regionStateStore.splitRegion(p, a, b, sn);
830     synchronized (this) {
831       // After PONR, split is considered to be done.
832       // Update server holdings to be aligned with the meta.
833       Set<HRegionInfo> regions = serverHoldings.get(sn);
834       if (regions == null) {
835         throw new IllegalStateException(sn + " should host some regions");
836       }
837       regions.remove(p);
838       regions.add(a);
839       regions.add(b);
840     }
841   }
842 
843   void mergeRegions(HRegionInfo p,
844       HRegionInfo a, HRegionInfo b, ServerName sn) throws IOException {
845     regionStateStore.mergeRegions(p, a, b, sn);
846     synchronized (this) {
847       // After PONR, merge is considered to be done.
848       // Update server holdings to be aligned with the meta.
849       Set<HRegionInfo> regions = serverHoldings.get(sn);
850       if (regions == null) {
851         throw new IllegalStateException(sn + " should host some regions");
852       }
853       regions.remove(a);
854       regions.remove(b);
855       regions.add(p);
856     }
857   }
858 
859   /**
860    * At cluster clean re/start, mark all user regions closed except those of tables
861    * that are excluded, such as disabled/disabling/enabling tables. All user regions
862    * and their previous locations are returned.
863    */
864   synchronized Map<HRegionInfo, ServerName> closeAllUserRegions(Set<TableName> excludedTables) {
865     boolean noExcludeTables = excludedTables == null || excludedTables.isEmpty();
866     Set<HRegionInfo> toBeClosed = new HashSet<HRegionInfo>(regionStates.size());
867     for(RegionState state: regionStates.values()) {
868       HRegionInfo hri = state.getRegion();
869       if (state.isSplit() || hri.isSplit()) {
870         continue;
871       }
872       TableName tableName = hri.getTable();
873       if (!TableName.META_TABLE_NAME.equals(tableName)
874           && (noExcludeTables || !excludedTables.contains(tableName))) {
875         toBeClosed.add(hri);
876       }
877     }
878     Map<HRegionInfo, ServerName> allUserRegions =
879       new HashMap<HRegionInfo, ServerName>(toBeClosed.size());
880     for (HRegionInfo hri: toBeClosed) {
881       RegionState regionState = updateRegionState(hri, State.CLOSED);
882       allUserRegions.put(hri, regionState.getServerName());
883     }
884     return allUserRegions;
885   }
886 
887   /**
888    * Compute the average load across all region servers.
889    * Currently, this uses a very naive computation - just uses the number of
890    * regions being served, ignoring stats about number of requests.
891    * @return the average load
892    */
893   protected synchronized double getAverageLoad() {
894     int numServers = 0, totalLoad = 0;
895     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
896       Set<HRegionInfo> regions = e.getValue();
897       ServerName serverName = e.getKey();
898       int regionCount = regions.size();
899       if (serverManager.isServerOnline(serverName)) {
900         totalLoad += regionCount;
901         numServers++;
902       }
903     }
904     if (numServers > 1) {
905       // The master region server holds only a couple regions.
906       // Don't consider this server in calculating the average load
907       // if there are other region servers to avoid possible confusion.
908       Set<HRegionInfo> hris = serverHoldings.get(server.getServerName());
909       if (hris != null) {
910         totalLoad -= hris.size();
911         numServers--;
912       }
913     }
914     return numServers == 0 ? 0.0 :
915       (double)totalLoad / (double)numServers;
916   }
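  // Worked example (illustrative numbers): with online region servers A and B holding
  // 10 and 20 regions and the master's own server holding 2, totalLoad = 32 and
  // numServers = 3; the master's 2 regions are then excluded, giving
  // (32 - 2) / (3 - 1) = 15.0.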
917 
918   /**
919    * This is an EXPENSIVE clone.  Cloning though is the safest thing to do.
920    * Can't let out the original since it can change, and at least the load balancer
921    * wants to iterate this exported list.  We need to synchronize on this instance
922    * since all access to serverHoldings is done under its lock.
923    *
924    * @return A clone of current assignments by table.
925    */
926   protected Map<TableName, Map<ServerName, List<HRegionInfo>>>
927       getAssignmentsByTable() {
928     Map<TableName, Map<ServerName, List<HRegionInfo>>> result =
929       new HashMap<TableName, Map<ServerName,List<HRegionInfo>>>();
930     synchronized (this) {
931       if (!server.getConfiguration().getBoolean("hbase.master.loadbalance.bytable", false)) {
932         Map<ServerName, List<HRegionInfo>> svrToRegions =
933           new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
934         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
935           svrToRegions.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
936         }
937         result.put(TableName.valueOf("ensemble"), svrToRegions);
938       } else {
939         for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
940           for (HRegionInfo hri: e.getValue()) {
941             if (hri.isMetaRegion()) continue;
942             TableName tablename = hri.getTable();
943             Map<ServerName, List<HRegionInfo>> svrToRegions = result.get(tablename);
944             if (svrToRegions == null) {
945               svrToRegions = new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
946               result.put(tablename, svrToRegions);
947             }
948             List<HRegionInfo> regions = svrToRegions.get(e.getKey());
949             if (regions == null) {
950               regions = new ArrayList<HRegionInfo>();
951               svrToRegions.put(e.getKey(), regions);
952             }
953             regions.add(hri);
954           }
955         }
956       }
957     }
958 
959     Map<ServerName, ServerLoad>
960       onlineSvrs = serverManager.getOnlineServers();
961     // Take care of servers w/o assignments, and remove servers in draining mode
962     List<ServerName> drainingServers = this.serverManager.getDrainingServersList();
963     for (Map<ServerName, List<HRegionInfo>> map: result.values()) {
964       for (ServerName svr: onlineSvrs.keySet()) {
965         if (!map.containsKey(svr)) {
966           map.put(svr, new ArrayList<HRegionInfo>());
967         }
968       }
969       map.keySet().removeAll(drainingServers);
970     }
971     return result;
972   }
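  // Illustrative: with "hbase.master.loadbalance.bytable" at its default (false), every
  // assignment is returned under the single pseudo table "ensemble"; enabling it, e.g.
  // conf.setBoolean("hbase.master.loadbalance.bytable", true), produces one
  // ServerName -> regions map per user table, with hbase:meta regions skipped.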
973 
974   protected RegionState getRegionState(final HRegionInfo hri) {
975     return getRegionState(hri.getEncodedName());
976   }
977 
978   /**
979    * Returns a clone of region assignments per server
980    * @return a Map of ServerName to a List of HRegionInfo's
981    */
982   protected synchronized Map<ServerName, List<HRegionInfo>> getRegionAssignmentsByServer() {
983     Map<ServerName, List<HRegionInfo>> regionsByServer =
984         new HashMap<ServerName, List<HRegionInfo>>(serverHoldings.size());
985     for (Map.Entry<ServerName, Set<HRegionInfo>> e: serverHoldings.entrySet()) {
986       regionsByServer.put(e.getKey(), new ArrayList<HRegionInfo>(e.getValue()));
987     }
988     return regionsByServer;
989   }
990 
991   protected synchronized RegionState getRegionState(final String encodedName) {
992     return regionStates.get(encodedName);
993   }
994 
995   /**
996    * Get the HRegionInfo from the cache; if not there, from the hbase:meta table.
997    * @param  regionName the region name to look up
998    * @return HRegionInfo for the region
999    */
1000   @SuppressWarnings("deprecation")
1001   protected HRegionInfo getRegionInfo(final byte [] regionName) {
1002     String encodedName = HRegionInfo.encodeRegionName(regionName);
1003     RegionState regionState = getRegionState(encodedName);
1004     if (regionState != null) {
1005       return regionState.getRegion();
1006     }
1007 
1008     try {
1009       Pair<HRegionInfo, ServerName> p =
1010         MetaTableAccessor.getRegion(server.getConnection(), regionName);
1011       HRegionInfo hri = p == null ? null : p.getFirst();
1012       if (hri != null) {
1013         createRegionState(hri);
1014       }
1015       return hri;
1016     } catch (IOException e) {
1017       server.abort("Aborting because an error occurred while reading "
1018         + Bytes.toStringBinary(regionName) + " from hbase:meta", e);
1019       return null;
1020     }
1021   }
1022 
1023   static boolean isOneOfStates(RegionState regionState, State... states) {
1024     State s = regionState != null ? regionState.getState() : null;
1025     for (State state: states) {
1026       if (s == state) return true;
1027     }
1028     return false;
1029   }
1030 
1031   /**
1032    * Update a region state. It will be put in transition if not already there.
1033    */
1034   private RegionState updateRegionState(final HRegionInfo hri,
1035       final State state, final ServerName serverName, long openSeqNum) {
1036     if (state == State.FAILED_CLOSE || state == State.FAILED_OPEN) {
1037       LOG.warn("Failed to open/close " + hri.getShortNameToLog()
1038         + " on " + serverName + ", set to " + state);
1039     }
1040 
1041     String encodedName = hri.getEncodedName();
1042     RegionState regionState = new RegionState(
1043       hri, state, System.currentTimeMillis(), serverName);
1044     RegionState oldState = getRegionState(encodedName);
1045     if (!regionState.equals(oldState)) {
1046       LOG.info("Transition " + oldState + " to " + regionState);
1047       // Persist region state before updating in-memory info, if needed
1048       regionStateStore.updateRegionState(openSeqNum, regionState, oldState);
1049     }
1050 
1051     synchronized (this) {
1052       regionsInTransition.put(encodedName, regionState);
1053       regionStates.put(encodedName, regionState);
1054 
1055       // For these states, region should be properly closed.
1056       // There should be no log splitting issue.
1057       if ((state == State.CLOSED || state == State.MERGED
1058           || state == State.SPLIT) && lastAssignments.containsKey(encodedName)) {
1059         ServerName last = lastAssignments.get(encodedName);
1060         if (last.equals(serverName)) {
1061           lastAssignments.remove(encodedName);
1062         } else {
1063           LOG.warn(encodedName + " moved to " + state + " on "
1064             + serverName + ", expected " + last);
1065         }
1066       }
1067 
1068       // Once a region is opened, record its last assignment right away.
1069       if (serverName != null && state == State.OPEN) {
1070         ServerName last = lastAssignments.get(encodedName);
1071         if (!serverName.equals(last)) {
1072           lastAssignments.put(encodedName, serverName);
1073           if (last != null && isServerDeadAndNotProcessed(last)) {
1074             LOG.warn(encodedName + " moved to " + serverName
1075             + ", while its previous host " + last
1076               + " is dead but not processed yet");
1077           }
1078         }
1079       }
1080 
1081       // notify the change
1082       this.notifyAll();
1083     }
1084     return regionState;
1085   }
1086 }